blob: 4d3a1e02130b5cae9b36056a91ff9d63418b9f5f [file] [log] [blame]
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

                 Documentation/ABI/testing/sysfs-bus-rbd

 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* Driver name as registered with the block layer and shown in sysfs/logs */
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

/* Upper bounds on user-supplied strings parsed during device setup */
#define RBD_MAX_SNAP_NAME_LEN	32
#define RBD_MAX_OPT_LEN		1024

/* Reserved snapshot name meaning "the writable image head, not a snapshot" */
#define RBD_SNAP_HEAD_NAME	"-"

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

/* Default notify timeout option value -- presumably seconds; TODO confirm
 * against the osd client's use of rbd_opts->notify_timeout */
#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	u64 image_size;			/* image size in bytes (from disk) */
	char *object_prefix;		/* data object name prefix (kmalloc'd) */
	__u8 obj_order;			/* log2 of the object/segment size */
	__u8 crypt_type;		/* copied verbatim from on-disk header */
	__u8 comp_type;			/* copied verbatim from on-disk header */
	struct ceph_snap_context *snapc;	/* snapshot ids; refcounted */
	size_t snap_names_len;		/* total bytes in snap_names */
	u64 snap_seq;			/* snap seq from on-disk header */
	u32 total_snaps;		/* entries in snapc/snap_sizes */

	char *snap_names;		/* NUL-separated names (kmalloc'd) */
	u64 *snap_sizes;		/* per-snapshot image size (kmalloc'd) */

	u64 obj_version;		/* osd version of the header object */
};

/* rbd-specific (non-libceph) mount options; see rbd_opts_tokens */
struct rbd_options {
	int	notify_timeout;
};
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;	/* libceph client handle */
	struct rbd_options	*rbd_opts;	/* owned; freed on release */
	struct kref		kref;		/* see rbd_client_release() */
	struct list_head	node;		/* entry on rbd_client_list */
};

/*
 * a request completion status
 */
struct rbd_req_status {
	int done;		/* nonzero once the osd reply arrived */
	int rc;			/* osd result code for this sub-request */
	u64 bytes;		/* bytes completed by this sub-request */
};

/*
 * a collection of requests
 */
struct rbd_req_coll {
	int			total;		/* number of status slots */
	int			num_done;	/* completed so far, in order */
	struct kref		kref;		/* released via rbd_coll_release() */
	struct rbd_req_status	status[0];	/* trailing variable-size array */
};

/*
 * a single io request
 */
struct rbd_request {
	struct request	*rq;		/* blk layer request */
	struct bio	*bio;		/* cloned bio */
	struct page	**pages;	/* list of used pages */
	u64		len;
	int		coll_index;	/* this request's slot in coll */
	struct rbd_req_coll	*coll;	/* NULL if not part of a collection */
};

/* In-memory snapshot record; also a sysfs device under the rbd device */
struct rbd_snap {
	struct device		dev;
	const char		*name;
	u64			size;
	struct list_head	node;	/* entry on rbd_device->snaps */
	u64			id;
};
146
/*
 * a single device
 */
struct rbd_device {
	int			id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */
	struct request_queue	*q;

	struct rbd_client	*rbd_client;	/* possibly shared; see rbd_get_client() */

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue lock */

	struct rbd_image_header	header;
	char			*image_name;	/* rbd image name (kmalloc'd) */
	size_t			image_name_len;
	char			*header_name;	/* name of the header object */
	char			*pool_name;	/* rados pool holding the image */
	int			pool_id;	/* numeric id of pool_name */

	struct ceph_osd_event   *watch_event;	/* header-object watch */
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;
	/* name of the snapshot this device reads from */
	char                    *snap_name;
	/* id of the snapshot this device reads from */
	u64                     snap_id;	/* current snapshot id */
	/* whether the snap_id this device reads from still exists */
	bool                    snap_exists;
	int                     read_only;	/* set when mapped at a snapshot */

	struct list_head	node;		/* entry on rbd_dev_list */

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
};
191
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Forward declarations for functions defined later in this file */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
static void rbd_dev_release(struct device *dev);
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count);
static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
				  struct rbd_snap *snap);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);

/* /sys/bus/rbd/{add,remove}: write-only control files (root only) */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/* Release for rbd_root_dev: the device is static, nothing to free */
static void rbd_root_dev_release(struct device *dev)
{
}

/* sysfs parent device for all rbd devices */
static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};
233
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800234
/* Take a reference on the rbd device's embedded struct device */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}

/* Drop a reference taken with rbd_get_dev() */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700244
Josh Durgin263c6ca2011-12-05 10:43:42 -0800245static int __rbd_refresh_header(struct rbd_device *rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700246
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700247static int rbd_open(struct block_device *bdev, fmode_t mode)
248{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600249 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700250
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800251 rbd_get_dev(rbd_dev);
252
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700253 set_device_ro(bdev, rbd_dev->read_only);
254
255 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
256 return -EROFS;
257
258 return 0;
259}
260
/* Block device release: drop the reference taken in rbd_open() */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	rbd_put_dev(rbd_dev);

	return 0;
}

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
275
276/*
277 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500278 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700279 */
Alex Elder43ae4702012-07-03 16:01:18 -0500280static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700281 struct rbd_options *rbd_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700282{
283 struct rbd_client *rbdc;
284 int ret = -ENOMEM;
285
286 dout("rbd_client_create\n");
287 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
288 if (!rbdc)
289 goto out_opt;
290
291 kref_init(&rbdc->kref);
292 INIT_LIST_HEAD(&rbdc->node);
293
Alex Elderbc534d82012-01-29 13:57:44 -0600294 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
295
Alex Elder43ae4702012-07-03 16:01:18 -0500296 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700297 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600298 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500299 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700300
301 ret = ceph_open_session(rbdc->client);
302 if (ret < 0)
303 goto out_err;
304
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700305 rbdc->rbd_opts = rbd_opts;
306
Alex Elder432b8582012-01-29 13:57:44 -0600307 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700308 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600309 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700310
Alex Elderbc534d82012-01-29 13:57:44 -0600311 mutex_unlock(&ctl_mutex);
312
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700313 dout("rbd_client_create created %p\n", rbdc);
314 return rbdc;
315
316out_err:
317 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600318out_mutex:
319 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700320 kfree(rbdc);
321out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500322 if (ceph_opts)
323 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400324 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700325}
326
327/*
328 * Find a ceph client with specific addr and configuration.
329 */
Alex Elder43ae4702012-07-03 16:01:18 -0500330static struct rbd_client *__rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700331{
332 struct rbd_client *client_node;
333
Alex Elder43ae4702012-07-03 16:01:18 -0500334 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700335 return NULL;
336
337 list_for_each_entry(client_node, &rbd_client_list, node)
Alex Elder43ae4702012-07-03 16:01:18 -0500338 if (!ceph_compare_options(ceph_opts, client_node->client))
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700339 return client_node;
340 return NULL;
341}
342
/*
 * mount options
 *
 * Tokens declared before Opt_last_int take an integer argument;
 * tokens between Opt_last_int and Opt_last_string take a string.
 * parse_rbd_opts_token() relies on this ordering.
 */
enum {
	Opt_notify_timeout,
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
};

static match_table_t rbd_opts_tokens = {
	{Opt_notify_timeout, "notify_timeout=%d"},
	/* int args above */
	/* string args above */
	{-1, NULL}
};
360
/*
 * Parse one rbd-level option token.
 *
 * Called back from ceph_parse_options() for each token libceph itself
 * does not recognize; private is the struct rbd_options being filled.
 * Returns 0 on success or a negative errno on bad input.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* Tokens below Opt_last_int carry an integer argument */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_notify_timeout:
		rbd_opts->notify_timeout = intval;
		break;
	default:
		BUG_ON(token);	/* every table entry must be handled above */
	}
	return 0;
}
395
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.
 *
 * On success the returned client holds a kref the caller must drop via
 * rbd_put_client().  Returns an ERR_PTR on failure.
 */
static struct rbd_client *rbd_get_client(const char *mon_addr,
					 size_t mon_addr_len,
					 char *options)
{
	struct rbd_client *rbdc;
	struct ceph_options *ceph_opts;
	struct rbd_options *rbd_opts;

	rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		return ERR_PTR(-ENOMEM);

	rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;

	/* libceph consumes its own options; the rest are routed through
	 * parse_rbd_opts_token() into rbd_opts */
	ceph_opts = ceph_parse_options(options, mon_addr,
				       mon_addr + mon_addr_len,
				       parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(ceph_opts)) {
		kfree(rbd_opts);
		return ERR_CAST(ceph_opts);
	}

	spin_lock(&rbd_client_list_lock);
	rbdc = __rbd_client_find(ceph_opts);
	if (rbdc) {
		/* using an existing client */
		kref_get(&rbdc->kref);
		spin_unlock(&rbd_client_list_lock);

		/* both are duplicates of what the shared client owns */
		ceph_destroy_options(ceph_opts);
		kfree(rbd_opts);

		return rbdc;
	}
	spin_unlock(&rbd_client_list_lock);

	/* rbd_client_create() owns ceph_opts from here (destroys it on
	 * failure); rbd_opts must still be freed here if it fails */
	rbdc = rbd_client_create(ceph_opts, rbd_opts);

	if (IS_ERR(rbdc))
		kfree(rbd_opts);

	return rbdc;
}
443
/*
 * Destroy ceph client
 *
 * kref release callback.  Takes rbd_client_list_lock itself to unlink
 * the client; callers of kref_put() must NOT hold that lock.  (An
 * earlier comment claiming the caller must hold it was stale -- the
 * locking moved into this function.)
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc->rbd_opts);
	kfree(rbdc);
}
462
/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_device *rbd_dev)
{
	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
	rbd_dev->rbd_client = NULL;	/* guard against use after put */
}
472
/*
 * Destroy requests collection
 *
 * kref release callback, invoked once every sub-request has dropped
 * its reference in rbd_coll_end_req_index().
 */
static void rbd_coll_release(struct kref *kref)
{
	struct rbd_req_coll *coll =
		container_of(kref, struct rbd_req_coll, kref);

	dout("rbd_coll_release %p\n", coll);
	kfree(coll);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700484
/*
 * Create a new header structure, translate header format from the on-disk
 * header.
 *
 * ondisk is a buffer read from the osd; allocated_snaps says how many
 * snapshot entries that buffer has room for.  Snapshot ids/sizes/names
 * are only copied when the buffer held the complete list
 * (allocated_snaps == snap_count); otherwise the caller is expected to
 * retry with a large enough buffer.
 *
 * Returns 0 on success, -ENXIO on bad header magic, -EINVAL on an
 * insane snapshot count, -ENOMEM on allocation failure.
 */
static int rbd_header_from_disk(struct rbd_image_header *header,
				 struct rbd_image_header_ondisk *ondisk,
				 u32 allocated_snaps,
				 gfp_t gfp_flags)
{
	u32 i, snap_count;

	if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT)))
		return -ENXIO;

	snap_count = le32_to_cpu(ondisk->snap_count);
	/*
	 * Guard the snapc allocation below against integer overflow.
	 * NOTE(review): the divisor is sizeof (*ondisk) although the
	 * allocation grows by sizeof(u64) per snapshot -- stricter than
	 * strictly required, which is safe; confirm intent before
	 * "correcting" it.
	 */
	if (snap_count > (UINT_MAX - sizeof(struct ceph_snap_context))
			 / sizeof (*ondisk))
		return -EINVAL;
	header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
				snap_count * sizeof(u64),
				gfp_flags);
	if (!header->snapc)
		return -ENOMEM;

	/*
	 * NOTE(review): snap_names_len comes straight from disk and is
	 * not validated here against the size of the buffer the names
	 * are memcpy'd from below -- verify callers bound it.
	 */
	header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
	if (snap_count) {
		header->snap_names = kmalloc(header->snap_names_len,
					     gfp_flags);
		if (!header->snap_names)
			goto err_snapc;
		header->snap_sizes = kmalloc(snap_count * sizeof(u64),
					     gfp_flags);
		if (!header->snap_sizes)
			goto err_names;
	} else {
		header->snap_names = NULL;
		header->snap_sizes = NULL;
	}

	/* +1 for the terminating '\0' added just below */
	header->object_prefix = kmalloc(sizeof (ondisk->block_name) + 1,
					gfp_flags);
	if (!header->object_prefix)
		goto err_sizes;

	memcpy(header->object_prefix, ondisk->block_name,
	       sizeof(ondisk->block_name));
	header->object_prefix[sizeof (ondisk->block_name)] = '\0';

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->obj_order = ondisk->options.order;
	header->crypt_type = ondisk->options.crypt_type;
	header->comp_type = ondisk->options.comp_type;

	atomic_set(&header->snapc->nref, 1);
	header->snap_seq = le64_to_cpu(ondisk->snap_seq);
	header->snapc->num_snaps = snap_count;
	header->total_snaps = snap_count;

	/* Only fill in snapshot data when the whole list was read */
	if (snap_count && allocated_snaps == snap_count) {
		for (i = 0; i < snap_count; i++) {
			header->snapc->snaps[i] =
				le64_to_cpu(ondisk->snaps[i].id);
			header->snap_sizes[i] =
				le64_to_cpu(ondisk->snaps[i].image_size);
		}

		/* copy snapshot names (packed right after the snap array) */
		memcpy(header->snap_names, &ondisk->snaps[i],
			header->snap_names_len);
	}

	return 0;

err_sizes:
	kfree(header->snap_sizes);
err_names:
	kfree(header->snap_names);
err_snapc:
	kfree(header->snapc);
	return -ENOMEM;
}
566
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700567static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
568 u64 *seq, u64 *size)
569{
570 int i;
571 char *p = header->snap_names;
572
Alex Elder00f1f362012-02-07 12:03:36 -0600573 for (i = 0; i < header->total_snaps; i++) {
574 if (!strcmp(snap_name, p)) {
575
576 /* Found it. Pass back its id and/or size */
577
578 if (seq)
579 *seq = header->snapc->snaps[i];
580 if (size)
581 *size = header->snap_sizes[i];
582 return i;
583 }
584 p += strlen(p) + 1; /* Skip ahead to the next name */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700585 }
Alex Elder00f1f362012-02-07 12:03:36 -0600586 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700587}
588
/*
 * Point rbd_dev at the snapshot named by rbd_dev->snap_name, or at the
 * writable image head when the name is RBD_SNAP_HEAD_NAME, updating
 * snap_id / snap_exists / read_only to match.  If size is non-NULL it
 * receives the image or snapshot size.
 *
 * Returns 0, or -ENOENT if the named snapshot does not exist.
 */
static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
{
	struct rbd_image_header *header = &rbd_dev->header;
	struct ceph_snap_context *snapc = header->snapc;
	int ret = -ENOENT;

	down_write(&rbd_dev->header_rwsem);

	if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		/* Mapping the head: writable, at the latest snap seq */
		if (header->total_snaps)
			snapc->seq = header->snap_seq;
		else
			snapc->seq = 0;
		rbd_dev->snap_id = CEPH_NOSNAP;
		rbd_dev->snap_exists = false;
		rbd_dev->read_only = 0;
		if (size)
			*size = header->image_size;
	} else {
		/* Mapping a named snapshot: always read-only */
		ret = snap_by_name(header, rbd_dev->snap_name,
				   &snapc->seq, size);
		if (ret < 0)
			goto done;
		rbd_dev->snap_id = snapc->seq;
		rbd_dev->snap_exists = true;
		rbd_dev->read_only = 1;
	}

	ret = 0;
done:
	up_write(&rbd_dev->header_rwsem);
	return ret;
}
623
624static void rbd_header_free(struct rbd_image_header *header)
625{
Alex Elder849b4262012-07-09 21:04:24 -0500626 kfree(header->object_prefix);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700627 kfree(header->snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -0500628 kfree(header->snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -0800629 ceph_put_snap_context(header->snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700630}
631
632/*
633 * get the actual striped segment name, offset and length
634 */
635static u64 rbd_get_segment(struct rbd_image_header *header,
Alex Elderca1e49a2012-07-10 20:30:09 -0500636 const char *object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700637 u64 ofs, u64 len,
638 char *seg_name, u64 *segofs)
639{
640 u64 seg = ofs >> header->obj_order;
641
642 if (seg_name)
643 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
Alex Elderca1e49a2012-07-10 20:30:09 -0500644 "%s.%012llx", object_prefix, seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700645
646 ofs = ofs & ((1 << header->obj_order) - 1);
647 len = min_t(u64, len, (1 << header->obj_order) - ofs);
648
649 if (segofs)
650 *segofs = ofs;
651
652 return len;
653}
654
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700655static int rbd_get_num_segments(struct rbd_image_header *header,
656 u64 ofs, u64 len)
657{
658 u64 start_seg = ofs >> header->obj_order;
659 u64 end_seg = (ofs + len - 1) >> header->obj_order;
660 return end_seg - start_seg + 1;
661}
662
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700663/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700664 * returns the size of an object in the image
665 */
666static u64 rbd_obj_bytes(struct rbd_image_header *header)
667{
668 return 1 << header->obj_order;
669}
670
671/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700672 * bio helpers
673 */
674
675static void bio_chain_put(struct bio *chain)
676{
677 struct bio *tmp;
678
679 while (chain) {
680 tmp = chain;
681 chain = chain->bi_next;
682 bio_put(tmp);
683 }
684}
685
/*
 * zeros a bio chain, starting at specific offset
 *
 * Used to clear data beyond a short read so the caller never sees
 * stale page contents.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* byte offset of current bvec within the chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* zero from start_ofs, or from the bvec's
				 * start if it lies wholly past start_ofs */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
712
713/*
714 * bio_chain_clone - clone a chain of bios up to a certain length.
715 * might return a bio_pair that will need to be released.
716 */
717static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
718 struct bio_pair **bp,
719 int len, gfp_t gfpmask)
720{
721 struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
722 int total = 0;
723
724 if (*bp) {
725 bio_pair_release(*bp);
726 *bp = NULL;
727 }
728
729 while (old_chain && (total < len)) {
730 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
731 if (!tmp)
732 goto err_out;
733
734 if (total + old_chain->bi_size > len) {
735 struct bio_pair *bp;
736
737 /*
738 * this split can only happen with a single paged bio,
739 * split_bio will BUG_ON if this is not the case
740 */
741 dout("bio_chain_clone split! total=%d remaining=%d"
742 "bi_size=%d\n",
743 (int)total, (int)len-total,
744 (int)old_chain->bi_size);
745
746 /* split the bio. We'll release it either in the next
747 call, or it will have to be released outside */
Alex Elder593a9e72012-02-07 12:03:37 -0600748 bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700749 if (!bp)
750 goto err_out;
751
752 __bio_clone(tmp, &bp->bio1);
753
754 *next = &bp->bio2;
755 } else {
756 __bio_clone(tmp, old_chain);
757 *next = old_chain->bi_next;
758 }
759
760 tmp->bi_bdev = NULL;
761 gfpmask &= ~__GFP_WAIT;
762 tmp->bi_next = NULL;
763
764 if (!new_chain) {
765 new_chain = tail = tmp;
766 } else {
767 tail->bi_next = tmp;
768 tail = tmp;
769 }
770 old_chain = old_chain->bi_next;
771
772 total += tmp->bi_size;
773 }
774
775 BUG_ON(total < len);
776
777 if (tail)
778 tail->bi_next = NULL;
779
780 *old = old_chain;
781
782 return new_chain;
783
784err_out:
785 dout("bio_chain_clone with err\n");
786 bio_chain_put(new_chain);
787 return NULL;
788}
789
790/*
791 * helpers for osd request op vectors.
792 */
793static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
794 int num_ops,
795 int opcode,
796 u32 payload_len)
797{
798 *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
799 GFP_NOIO);
800 if (!*ops)
801 return -ENOMEM;
802 (*ops)[0].op = opcode;
803 /*
804 * op extent offset and length will be set later on
805 * in calc_raw_layout()
806 */
807 (*ops)[0].payload_len = payload_len;
808 return 0;
809}
810
/* Free an op vector allocated by rbd_create_rw_ops() */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
815
/*
 * Record completion of slot `index' in a request collection, then
 * finish against the blk request every leading run of now-complete
 * slots.  Slots must complete the blk request in index order, so an
 * out-of-order osd reply is only recorded here and finished later once
 * the earlier slots are done.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n",
	     coll, index, ret, len);

	if (!rq)
		return;

	if (!coll) {
		/* request was not split: complete it directly */
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	/* queue_lock serializes status[] updates and blk completion */
	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	/* advance max over the contiguous run of done slots at num_done */
	max = min = coll->num_done;
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i<max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		/* each finished slot drops one collection reference */
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
853
/* Complete this request's own slot in its collection (if any) */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
859
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700860/*
861 * Send ceph osd request
862 */
863static int rbd_do_request(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -0500864 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700865 struct ceph_snap_context *snapc,
866 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -0500867 const char *object_name, u64 ofs, u64 len,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700868 struct bio *bio,
869 struct page **pages,
870 int num_pages,
871 int flags,
872 struct ceph_osd_req_op *ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700873 struct rbd_req_coll *coll,
874 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700875 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700876 struct ceph_msg *msg),
877 struct ceph_osd_request **linger_req,
878 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700879{
880 struct ceph_osd_request *req;
881 struct ceph_file_layout *layout;
882 int ret;
883 u64 bno;
884 struct timespec mtime = CURRENT_TIME;
885 struct rbd_request *req_data;
886 struct ceph_osd_request_head *reqhead;
Alex Elder1dbb4392012-01-24 10:08:37 -0600887 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700888
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700889 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700890 if (!req_data) {
891 if (coll)
892 rbd_coll_end_req_index(rq, coll, coll_index,
893 -ENOMEM, len);
894 return -ENOMEM;
895 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700896
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700897 if (coll) {
898 req_data->coll = coll;
899 req_data->coll_index = coll_index;
900 }
901
Alex Elderaded07e2012-07-03 16:01:18 -0500902 dout("rbd_do_request object_name=%s ofs=%lld len=%lld\n",
903 object_name, len, ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700904
Alex Elder0ce1a792012-07-03 16:01:18 -0500905 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder1dbb4392012-01-24 10:08:37 -0600906 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
907 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -0700908 if (!req) {
Sage Weil4ad12622011-05-03 09:23:36 -0700909 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700910 goto done_pages;
911 }
912
913 req->r_callback = rbd_cb;
914
915 req_data->rq = rq;
916 req_data->bio = bio;
917 req_data->pages = pages;
918 req_data->len = len;
919
920 req->r_priv = req_data;
921
922 reqhead = req->r_request->front.iov_base;
923 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
924
Alex Elderaded07e2012-07-03 16:01:18 -0500925 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700926 req->r_oid_len = strlen(req->r_oid);
927
928 layout = &req->r_file_layout;
929 memset(layout, 0, sizeof(*layout));
930 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
931 layout->fl_stripe_count = cpu_to_le32(1);
932 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
Alex Elder0ce1a792012-07-03 16:01:18 -0500933 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
Alex Elder1dbb4392012-01-24 10:08:37 -0600934 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
935 req, ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700936
937 ceph_osdc_build_request(req, ofs, &len,
938 ops,
939 snapc,
940 &mtime,
941 req->r_oid, req->r_oid_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700942
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700943 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600944 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700945 *linger_req = req;
946 }
947
Alex Elder1dbb4392012-01-24 10:08:37 -0600948 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700949 if (ret < 0)
950 goto done_err;
951
952 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600953 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700954 if (ver)
955 *ver = le64_to_cpu(req->r_reassert_version.version);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700956 dout("reassert_ver=%lld\n",
957 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700958 ceph_osdc_put_request(req);
959 }
960 return ret;
961
962done_err:
963 bio_chain_put(req_data->bio);
964 ceph_osdc_put_request(req);
965done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700966 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700967 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700968 return ret;
969}
970
/*
 * Ceph osd op callback
 *
 * Completion handler for asynchronous per-segment requests issued
 * by rbd_do_op().  Parses the reply, zero-fills read data for
 * missing objects or short reads, reports the result into the
 * segment's collection slot, and frees the request and its
 * tracking state.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);	/* first op follows the header */
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);

	if (rc == -ENOENT && read_op) {
		/* object doesn't exist: a read returns all zeroes */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero the tail, report the full length */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1009
/*
 * Completion callback for requests whose reply payload is not
 * needed: simply drop the request reference.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req,
			      struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1014
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a page vector to carry the data; when orig_ops is NULL,
 * builds a single read/write op of the given opcode (writes copy
 * buf into the pages first).  Issues the request via
 * rbd_do_request() with no callback so it completes synchronously,
 * then copies read data back into buf.  Ops built here are
 * destroyed here; caller-supplied orig_ops stay owned by the caller.
 *
 * Returns a negative errno, or the (non-negative) result of the
 * operation / number of bytes copied out for reads.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int opcode,
			   int flags,
			   struct ceph_osd_req_op *orig_ops,
			   const char *object_name,
			   u64 ofs, u64 len,
			   char *buf,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;
	struct ceph_osd_req_op *ops = orig_ops;
	u32 payload_len;

	num_pages = calc_pages_for(ofs , len);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	if (!orig_ops) {
		/* only writes carry a data payload */
		payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
		ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
		if (ret < 0)
			goto done;

		if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
			ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
			if (ret < 0)
				goto done_ops;
		}
	}

	/* no callback (NULL) makes rbd_do_request() wait for completion */
	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			  object_name, ofs, len, NULL,
			  pages, num_pages,
			  flags,
			  ops,
			  NULL, 0,
			  NULL,
			  linger_req, ver);
	if (ret < 0)
		goto done_ops;

	if ((flags & CEPH_OSD_FLAG_READ) && buf)
		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);

done_ops:
	if (!orig_ops)
		rbd_destroy_ops(ops);
done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1075
/*
 * Do an asynchronous ceph osd operation
 *
 * Translates an (ofs, len) extent of the image into the name and
 * offset of the single backing-object segment that covers it, then
 * sends an async request whose completion runs rbd_req_cb() into
 * the given collection slot.
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 snapid,
		     int opcode, int flags,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;

	seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
	if (!seg_name)
		return -ENOMEM;

	seg_len = rbd_get_segment(&rbd_dev->header,
				  rbd_dev->header.object_prefix,
				  ofs, len,
				  seg_name, &seg_ofs);

	/* only writes carry a data payload */
	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);

	ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
	if (ret < 0)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	BUG_ON(seg_len < len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1130
1131/*
1132 * Request async osd write
1133 */
1134static int rbd_req_write(struct request *rq,
1135 struct rbd_device *rbd_dev,
1136 struct ceph_snap_context *snapc,
1137 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001138 struct bio *bio,
1139 struct rbd_req_coll *coll,
1140 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001141{
1142 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1143 CEPH_OSD_OP_WRITE,
1144 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001145 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001146}
1147
1148/*
1149 * Request async osd read
1150 */
1151static int rbd_req_read(struct request *rq,
1152 struct rbd_device *rbd_dev,
1153 u64 snapid,
1154 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001155 struct bio *bio,
1156 struct rbd_req_coll *coll,
1157 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001158{
1159 return rbd_do_op(rq, rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001160 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001161 CEPH_OSD_OP_READ,
1162 CEPH_OSD_FLAG_READ,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001163 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001164}
1165
1166/*
1167 * Request sync osd read
1168 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001169static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001170 struct ceph_snap_context *snapc,
1171 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001172 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001173 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001174 char *buf,
1175 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001176{
Alex Elder0ce1a792012-07-03 16:01:18 -05001177 return rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001178 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001179 CEPH_OSD_OP_READ,
1180 CEPH_OSD_FLAG_READ,
1181 NULL,
Alex Elderd1f57ea2012-06-26 12:57:03 -07001182 object_name, ofs, len, buf, NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001183}
1184
/*
 * Request osd notify ack
 *
 * Tell the osd we have processed the notification identified by
 * notify_id on object_name.  The request completes asynchronously
 * via rbd_simple_req_cb(), which just drops the request reference.
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id,
				   const char *object_name)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (ret < 0)
		return ret;

	ops[0].watch.ver = cpu_to_le64(ver);
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  object_name, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  ops,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1215
/*
 * Watch-event callback for the header object: a change notification
 * arrived, so re-read the header under ctl_mutex and then ack the
 * notification with the refreshed header version.
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	u64 hver;
	int rc;

	if (!rbd_dev)
		return;

	dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n",
		rbd_dev->header_name, notify_id, (int) opcode);
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	rc = __rbd_refresh_header(rbd_dev);
	hver = rbd_dev->header.obj_version;
	mutex_unlock(&ctl_mutex);
	if (rc)
		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
			   " update snaps: %d\n", rbd_dev->major, rc);

	/* ack unconditionally (even on refresh failure) so notifies keep flowing */
	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id, rbd_dev->header_name);
}
1237
/*
 * Request sync osd watch
 *
 * Register a watch on object_name so header changes trigger
 * rbd_watch_cb().  The osd event and the lingering osd request are
 * stored on the device (watch_event / watch_request) for teardown
 * by rbd_req_sync_unwatch().
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev,
			      const char *object_name,
			      u64 ver)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;

	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
	if (ret < 0)
		return ret;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(ver);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 here (vs 0 in unwatch) registers the watch */

	/* linger_req keeps the request alive so the watch persists */
	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      object_name, 0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1282
/*
 * Request sync osd unwatch
 *
 * Unregister the watch set up by rbd_req_sync_watch() and cancel
 * the associated osd event.
 */
static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev,
				const char *object_name)
{
	struct ceph_osd_req_op *ops;

	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
	if (ret < 0)
		return ret;

	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 0;	/* 0 here (vs 1 in watch) removes the watch */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      object_name, 0, 0, NULL, NULL, NULL);

	/* cancel the event even if the unwatch op itself failed */
	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	return ret;
}
1311
/* Context handed to rbd_notify_cb() via the osd event's data pointer. */
struct rbd_notify_info {
	struct rbd_device *rbd_dev;	/* device the notify was sent for */
};
1315
1316static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1317{
Alex Elder0ce1a792012-07-03 16:01:18 -05001318 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1319 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001320 return;
1321
Alex Elder0ce1a792012-07-03 16:01:18 -05001322 dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n",
Alex Elder0bed54d2012-07-03 16:01:18 -05001323 rbd_dev->header_name,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001324 notify_id, (int)opcode);
1325}
1326
/*
 * Request sync osd notify
 *
 * Send a notify on object_name and wait (up to the default osd
 * timeout) for the notification round to complete.
 */
static int rbd_req_sync_notify(struct rbd_device *rbd_dev,
			       const char *object_name)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_event *event;
	struct rbd_notify_info info;
	/* presumably prot_ver + timeout, mirroring the fields set below — TODO confirm */
	int payload_len = sizeof(u32) + sizeof(u32);
	int ret;

	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
	if (ret < 0)
		return ret;

	info.rbd_dev = rbd_dev;

	/* one-shot event; note the data pointer is &info, not rbd_dev */
	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
				     (void *)&info, &event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = 1;
	ops[0].watch.flag = 1;
	ops[0].watch.cookie = event->cookie;
	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
	ops[0].watch.timeout = 12;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      object_name, 0, 0, NULL, NULL, NULL);
	if (ret < 0)
		goto fail_event;

	/* NOTE(review): the wait result is logged but not propagated */
	ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
	dout("ceph_osdc_wait_event returned %d\n", ret);
	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(event);
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1377
/*
 * Request sync osd class method call ("exec")
 *
 * Invoke method_name of class_name on object_name, passing len
 * bytes of data as input (used e.g. for "rbd"/"snap_add" in
 * rbd_header_add_snap()).
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *data,
			     int len,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int class_name_len = strlen(class_name);
	int method_name_len = strlen(method_name);
	/* payload carries class name, method name and the input data */
	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
				    class_name_len + method_name_len + len);
	if (ret < 0)
		return ret;

	ops[0].cls.class_name = class_name;
	ops[0].cls.class_len = (__u8) class_name_len;
	ops[0].cls.method_name = method_name;
	ops[0].cls.method_len = (__u8) method_name_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = data;
	ops[0].cls.indata_len = len;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      object_name, 0, 0, NULL, NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1417
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001418static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1419{
1420 struct rbd_req_coll *coll =
1421 kzalloc(sizeof(struct rbd_req_coll) +
1422 sizeof(struct rbd_req_status) * num_reqs,
1423 GFP_ATOMIC);
1424
1425 if (!coll)
1426 return NULL;
1427 coll->total = num_reqs;
1428 kref_init(&coll->kref);
1429 return coll;
1430}
1431
/*
 * block device queue callback
 *
 * Entered with q->queue_lock held (the lock is explicitly dropped
 * while segments are cloned/submitted and re-taken before ending a
 * request or fetching the next one).  Each request is split into
 * per-object segments; their completions are gathered through an
 * rbd_req_coll so the block layer sees them in order.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;
	struct bio_pair *bp = NULL;

	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		struct bio *rq_bio, *next_bio = NULL;
		bool do_write;
		int size, op_size = 0;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;

		/* peek at request from block layer */
		if (!rq)
			break;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		rq_bio = rq->bio;
		if (do_write && rbd_dev->read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		/* drop queue_lock while we clone and submit segments */
		spin_unlock_irq(q->queue_lock);

		/* header_rwsem guards snap state and the snap context */
		down_read(&rbd_dev->header_rwsem);

		if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) {
			up_read(&rbd_dev->header_rwsem);
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		/* hold a snap context reference for the life of this request */
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		do {
			/* a bio clone to be passed down to OSD req */
			dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
			op_size = rbd_get_segment(&rbd_dev->header,
						  rbd_dev->header.object_prefix,
						  ofs, size,
						  NULL, NULL);
			/* one collection reference per in-flight segment */
			kref_get(&coll->kref);
			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
					      op_size, GFP_ATOMIC);
			if (!bio) {
				/* fail just this segment; the rest proceed */
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, op_size);
				goto next_seg;
			}


			/* init OSD command: write or read */
			if (do_write)
				rbd_req_write(rq, rbd_dev,
					      snapc,
					      ofs,
					      op_size, bio,
					      coll, cur_seg);
			else
				rbd_req_read(rq, rbd_dev,
					     rbd_dev->snap_id,
					     ofs,
					     op_size, bio,
					     coll, cur_seg);

next_seg:
			size -= op_size;
			ofs += op_size;

			cur_seg++;
			rq_bio = next_bio;
		} while (size > 0);
		/* drop the initial reference from rbd_alloc_coll() */
		kref_put(&coll->kref, rbd_coll_release);

		if (bp)
			bio_pair_release(bp);
		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1550
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	unsigned int chunk_sectors;	/* sectors per backing object */
	sector_t sector;		/* absolute start sector of the bio */
	unsigned int bio_sectors;	/* sectors already in the bio */
	int max;

	chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
	bio_sectors = bmd->bi_size >> SECTOR_SHIFT;

	/* bytes remaining in the current object past the bio's end */
	max = (chunk_sectors - ((sector & (chunk_sectors - 1))
				+ bio_sectors)) << SECTOR_SHIFT;
	if (max < 0)
		max = 0; /* bio_add cannot handle a negative return */
	if (max <= bvec->bv_len && bio_sectors == 0)
		return bvec->bv_len;	/* always accept at least one bvec */
	return max;
}
1577
/*
 * Free the device's gendisk (and its queue) along with the in-core
 * image header.  Safe to call when no disk was ever allocated.
 */
static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk = rbd_dev->disk;

	if (!disk)
		return;

	rbd_header_free(&rbd_dev->header);

	/* only unregister if the disk made it through add_disk() */
	if (disk->flags & GENHD_FL_UP)
		del_gendisk(disk);
	if (disk->queue)
		blk_cleanup_queue(disk->queue);
	put_disk(disk);
}
1593
/*
 * Reload the on-disk header into *header.
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
			   struct rbd_image_header *header)
{
	ssize_t rc;
	struct rbd_image_header_ondisk *dh;
	u32 snap_count = 0;
	u64 ver;
	size_t len;

	/*
	 * First reads the fixed-size header to determine the number
	 * of snapshots, then re-reads it, along with all snapshot
	 * records as well as their stored names.
	 */
	len = sizeof (*dh);
	while (1) {
		dh = kmalloc(len, GFP_KERNEL);
		if (!dh)
			return -ENOMEM;

		rc = rbd_req_sync_read(rbd_dev,
				       NULL, CEPH_NOSNAP,
				       rbd_dev->header_name,
				       0, len,
				       (char *)dh, &ver);
		if (rc < 0)
			goto out_dh;

		rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
		if (rc < 0) {
			if (rc == -ENXIO)
				pr_warning("unrecognized header format"
					   " for image %s\n",
					   rbd_dev->image_name);
			goto out_dh;
		}

		/* done once the snapshot count didn't change under us */
		if (snap_count == header->total_snaps)
			break;

		/* count changed between reads: retry with room for the
		 * new snapshot records and their names */
		snap_count = header->total_snaps;
		len = sizeof (*dh) +
			snap_count * sizeof(struct rbd_image_snap_ondisk) +
			header->snap_names_len;

		rbd_header_free(header);
		kfree(dh);
	}
	header->obj_version = ver;

out_dh:
	kfree(dh);
	return rc;
}
1651
1652/*
1653 * create a snapshot
1654 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001655static int rbd_header_add_snap(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001656 const char *snap_name,
1657 gfp_t gfp_flags)
1658{
1659 int name_len = strlen(snap_name);
1660 u64 new_snapid;
1661 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001662 void *data, *p, *e;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001663 u64 ver;
Alex Elder1dbb4392012-01-24 10:08:37 -06001664 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001665
1666 /* we should create a snapshot only if we're pointing at the head */
Alex Elder0ce1a792012-07-03 16:01:18 -05001667 if (rbd_dev->snap_id != CEPH_NOSNAP)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001668 return -EINVAL;
1669
Alex Elder0ce1a792012-07-03 16:01:18 -05001670 monc = &rbd_dev->rbd_client->client->monc;
1671 ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001672 dout("created snapid=%lld\n", new_snapid);
1673 if (ret < 0)
1674 return ret;
1675
1676 data = kmalloc(name_len + 16, gfp_flags);
1677 if (!data)
1678 return -ENOMEM;
1679
Sage Weil916d4d62011-05-12 16:10:50 -07001680 p = data;
1681 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001682
Sage Weil916d4d62011-05-12 16:10:50 -07001683 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1684 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001685
Alex Elder0bed54d2012-07-03 16:01:18 -05001686 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
Alex Elder0ce1a792012-07-03 16:01:18 -05001687 "rbd", "snap_add",
Sage Weil916d4d62011-05-12 16:10:50 -07001688 data, p - data, &ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001689
Sage Weil916d4d62011-05-12 16:10:50 -07001690 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001691
1692 if (ret < 0)
1693 return ret;
1694
Alex Elder0ce1a792012-07-03 16:01:18 -05001695 down_write(&rbd_dev->header_rwsem);
1696 rbd_dev->header.snapc->seq = new_snapid;
1697 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001698
1699 return 0;
1700bad:
1701 return -ERANGE;
1702}
1703
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001704static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1705{
1706 struct rbd_snap *snap;
1707
1708 while (!list_empty(&rbd_dev->snaps)) {
1709 snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node);
1710 __rbd_remove_snap_dev(rbd_dev, snap);
1711 }
1712}
1713
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001714/*
1715 * only read the first part of the ondisk header, without the snaps info
1716 */
Josh Durgin263c6ca2011-12-05 10:43:42 -08001717static int __rbd_refresh_header(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001718{
1719 int ret;
1720 struct rbd_image_header h;
1721 u64 snap_seq;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001722 int follow_seq = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001723
1724 ret = rbd_read_header(rbd_dev, &h);
1725 if (ret < 0)
1726 return ret;
1727
Josh Durgina51aa0c2011-12-05 10:35:04 -08001728 down_write(&rbd_dev->header_rwsem);
1729
Sage Weil9db4b3e2011-04-19 22:49:06 -07001730 /* resized? */
Josh Durgin474ef7c2011-11-21 17:13:54 -08001731 if (rbd_dev->snap_id == CEPH_NOSNAP) {
1732 sector_t size = (sector_t) h.image_size / SECTOR_SIZE;
1733
1734 dout("setting size to %llu sectors", (unsigned long long) size);
1735 set_capacity(rbd_dev->disk, size);
1736 }
Sage Weil9db4b3e2011-04-19 22:49:06 -07001737
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001738 snap_seq = rbd_dev->header.snapc->seq;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001739 if (rbd_dev->header.total_snaps &&
1740 rbd_dev->header.snapc->snaps[0] == snap_seq)
1741 /* pointing at the head, will need to follow that
1742 if head moves */
1743 follow_seq = 1;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001744
Alex Elder849b4262012-07-09 21:04:24 -05001745 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001746 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05001747 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08001748 /* osd requests may still refer to snapc */
1749 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001750
Josh Durgina71b8912011-12-05 18:10:44 -08001751 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08001752 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001753 rbd_dev->header.total_snaps = h.total_snaps;
1754 rbd_dev->header.snapc = h.snapc;
1755 rbd_dev->header.snap_names = h.snap_names;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001756 rbd_dev->header.snap_names_len = h.snap_names_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001757 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05001758 /* Free the extra copy of the object prefix */
1759 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1760 kfree(h.object_prefix);
1761
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001762 if (follow_seq)
1763 rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0];
1764 else
1765 rbd_dev->header.snapc->seq = snap_seq;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001766
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001767 ret = __rbd_init_snaps_header(rbd_dev);
1768
Josh Durginc6666012011-11-21 17:11:12 -08001769 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001770
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001771 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001772}
1773
/*
 * Read the image header, set up the gendisk and request queue for
 * the mapped image, and announce the disk.  Returns 0 or a negative
 * errno.
 *
 * NOTE(review): if a step after rbd_read_header() succeeds fails,
 * the header data read into rbd_dev->header is not released here --
 * presumably the caller's cleanup path frees it; verify.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	int rc;
	u64 segment_size;
	u64 total_size = 0;

	/* contact OSD, request size info about the object being mapped */
	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (rc)
		return rc;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = __rbd_init_snaps_header(rbd_dev);
	if (rc)
		return rc;

	/* Resolve the requested snapshot; fills in total_size */
	rc = rbd_header_set_snap(rbd_dev, &total_size);
	if (rc)
		return rc;

	/* create gendisk info */
	rc = -ENOMEM;
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		goto out;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	rc = -ENOMEM;
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;
	rbd_dev->q = q;

	/* finally, announce the disk to the world */
	set_capacity(disk, total_size / SECTOR_SIZE);
	add_disk(disk);

	pr_info("%s: added with size 0x%llx\n",
		disk->disk_name, (unsigned long long)total_size);
	return 0;

out_disk:
	put_disk(disk);
out:
	return rc;
}
1846
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001847/*
1848 sysfs
1849*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001850
/* Map a sysfs struct device back to its containing rbd_device */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1855
/* sysfs "size": current capacity of the mapped image, in bytes */
static ssize_t rbd_size_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	sector_t size;

	/* Capacity is updated under header_rwsem on refresh */
	down_read(&rbd_dev->header_rwsem);
	size = get_capacity(rbd_dev->disk);
	up_read(&rbd_dev->header_rwsem);

	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
}
1868
/* sysfs "major": block device major number assigned to this device */
static ssize_t rbd_major_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%d\n", rbd_dev->major);
}
1876
/* sysfs "client_id": the ceph client id for this device's connection */
static ssize_t rbd_client_id_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "client%lld\n",
		       ceph_client_id(rbd_dev->rbd_client->client));
}
1885
/* sysfs "pool": name of the rados pool the image lives in */
static ssize_t rbd_pool_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->pool_name);
}
1893
/* sysfs "pool_id": numeric id of the rados pool the image lives in */
static ssize_t rbd_pool_id_show(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%d\n", rbd_dev->pool_id);
}
1901
/* sysfs "name": the rbd image name this device is mapped to */
static ssize_t rbd_name_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->image_name);
}
1909
/* sysfs "current_snap": name of the currently-mapped snapshot */
static ssize_t rbd_snap_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->snap_name);
}
1918
/*
 * sysfs "refresh" (write-only): re-read the image header from the
 * cluster.  Returns the write size on success, or a negative errno.
 */
static ssize_t rbd_image_refresh(struct device *dev,
				 struct device_attribute *attr,
				 const char *buf,
				 size_t size)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int rc;
	int ret = size;	/* sysfs store: report full write on success */

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rc = __rbd_refresh_header(rbd_dev);
	if (rc < 0)
		ret = rc;

	mutex_unlock(&ctl_mutex);
	return ret;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001937
/*
 * Per-device sysfs attributes, exposed under
 * /sys/bus/rbd/devices/<id>/.  "refresh" and "create_snap" are
 * write-only actions; the rest are read-only properties.
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	&dev_attr_create_snap.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* No per-device resources to release; rbd_dev lifetime handled elsewhere */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
1979
1980
1981/*
1982 sysfs - snapshots
1983*/
1984
/* sysfs snap "snap_size": image size at the time of this snapshot */
static ssize_t rbd_snap_size_show(struct device *dev,
				  struct device_attribute *attr,
				  char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
}
1993
/* sysfs snap "snap_id": the snapshot's rados snapshot id */
static ssize_t rbd_snap_id_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
}
2002
/* Per-snapshot sysfs attributes, exposed under the snap_<name> device */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* Final unref of a snap device: free the snap's name and the snap itself */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2032
/*
 * Unlink a snapshot from the device's snap list and unregister its
 * sysfs device.  The device's release callback frees the snap.
 */
static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
				  struct rbd_snap *snap)
{
	list_del(&snap->node);
	device_unregister(&snap->dev);
}
2039
2040static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
2041 struct rbd_snap *snap,
2042 struct device *parent)
2043{
2044 struct device *dev = &snap->dev;
2045 int ret;
2046
2047 dev->type = &rbd_snap_device_type;
2048 dev->parent = parent;
2049 dev->release = rbd_snap_dev_release;
2050 dev_set_name(dev, "snap_%s", snap->name);
2051 ret = device_register(dev);
2052
2053 return ret;
2054}
2055
2056static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
2057 int i, const char *name,
2058 struct rbd_snap **snapp)
2059{
2060 int ret;
2061 struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
2062 if (!snap)
2063 return -ENOMEM;
2064 snap->name = kstrdup(name, GFP_KERNEL);
2065 snap->size = rbd_dev->header.snap_sizes[i];
2066 snap->id = rbd_dev->header.snapc->snaps[i];
2067 if (device_is_registered(&rbd_dev->dev)) {
2068 ret = rbd_register_snap_dev(rbd_dev, snap,
2069 &rbd_dev->dev);
2070 if (ret < 0)
2071 goto err;
2072 }
2073 *snapp = snap;
2074 return 0;
2075err:
2076 kfree(snap->name);
2077 kfree(snap);
2078 return ret;
2079}
2080
2081/*
2082 * search for the previous snap in a null delimited string list
2083 */
2084const char *rbd_prev_snap_name(const char *name, const char *start)
2085{
2086 if (name < start + 2)
2087 return NULL;
2088
2089 name -= 2;
2090 while (*name) {
2091 if (name == start)
2092 return start;
2093 name--;
2094 }
2095 return name + 1;
2096}
2097
2098/*
2099 * compare the old list of snapshots that we have to what's in the header
2100 * and update it accordingly. Note that the header holds the snapshots
2101 * in a reverse order (from newest to oldest) and we need to go from
2102 * older to new so that we don't get a duplicate snap name when
2103 * doing the process (e.g., removed snapshot and recreated a new
2104 * one with the same name.
2105 */
2106static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
2107{
2108 const char *name, *first_name;
2109 int i = rbd_dev->header.total_snaps;
2110 struct rbd_snap *snap, *old_snap = NULL;
2111 int ret;
2112 struct list_head *p, *n;
2113
2114 first_name = rbd_dev->header.snap_names;
2115 name = first_name + rbd_dev->header.snap_names_len;
2116
2117 list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
2118 u64 cur_id;
2119
2120 old_snap = list_entry(p, struct rbd_snap, node);
2121
2122 if (i)
2123 cur_id = rbd_dev->header.snapc->snaps[i - 1];
2124
2125 if (!i || old_snap->id < cur_id) {
Josh Durgine88a36e2011-11-21 18:14:25 -08002126 /*
2127 * old_snap->id was skipped, thus was
2128 * removed. If this rbd_dev is mapped to
2129 * the removed snapshot, record that it no
2130 * longer exists, to prevent further I/O.
2131 */
2132 if (rbd_dev->snap_id == old_snap->id)
2133 rbd_dev->snap_exists = false;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002134 __rbd_remove_snap_dev(rbd_dev, old_snap);
2135 continue;
2136 }
2137 if (old_snap->id == cur_id) {
2138 /* we have this snapshot already */
2139 i--;
2140 name = rbd_prev_snap_name(name, first_name);
2141 continue;
2142 }
2143 for (; i > 0;
2144 i--, name = rbd_prev_snap_name(name, first_name)) {
2145 if (!name) {
2146 WARN_ON(1);
2147 return -EINVAL;
2148 }
2149 cur_id = rbd_dev->header.snapc->snaps[i];
2150 /* snapshot removal? handle it above */
2151 if (cur_id >= old_snap->id)
2152 break;
2153 /* a new snapshot */
2154 ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
2155 if (ret < 0)
2156 return ret;
2157
2158 /* note that we add it backward so using n and not p */
2159 list_add(&snap->node, n);
2160 p = &snap->node;
2161 }
2162 }
2163 /* we're done going over the old snap list, just add what's left */
2164 for (; i > 0; i--) {
2165 name = rbd_prev_snap_name(name, first_name);
2166 if (!name) {
2167 WARN_ON(1);
2168 return -EINVAL;
2169 }
2170 ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
2171 if (ret < 0)
2172 return ret;
2173 list_add(&snap->node, &rbd_dev->snaps);
2174 }
2175
2176 return 0;
2177}
2178
/*
 * Register the rbd device on the rbd bus (named by its numeric id)
 * and register a sysfs device for each of its snapshots.  Returns 0
 * or a negative errno.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	int ret;
	struct device *dev;
	struct rbd_snap *snap;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	dev = &rbd_dev->dev;

	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->id);
	ret = device_register(dev);
	if (ret < 0)
		goto out;

	/* Snap devices become children of the rbd device just registered */
	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		ret = rbd_register_snap_dev(rbd_dev, snap,
					     &rbd_dev->dev);
		if (ret < 0)
			break;
	}
out:
	mutex_unlock(&ctl_mutex);
	return ret;
}
2207
/* Remove the rbd device (and, via the device core, its children) */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2212
/*
 * Set up a watch on the image header object.  -ERANGE indicates our
 * header version is stale, so refresh the header and retry until the
 * watch is registered (or a different error occurs).
 */
static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
{
	int ret, rc;

	do {
		ret = rbd_req_sync_watch(rbd_dev, rbd_dev->header_name,
					 rbd_dev->header.obj_version);
		if (ret == -ERANGE) {
			mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
			rc = __rbd_refresh_header(rbd_dev);
			mutex_unlock(&ctl_mutex);
			if (rc < 0)
				return rc;
		}
	} while (ret == -ERANGE);

	return ret;
}
2231
/* Highest device id ever handed out; ids start at 1 */
static atomic64_t rbd_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list. The minimum rbd id is 1.
 */
static void rbd_id_get(struct rbd_device *rbd_dev)
{
	rbd_dev->id = atomic64_inc_return(&rbd_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002246
Alex Elder1ddbe942012-01-29 13:57:44 -06002247/*
Alex Elder499afd52012-02-02 08:13:29 -06002248 * Remove an rbd_dev from the global list, and record that its
2249 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002250 */
Alex Elder499afd52012-02-02 08:13:29 -06002251static void rbd_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002252{
Alex Elderd184f6b2012-01-29 13:57:44 -06002253 struct list_head *tmp;
2254 int rbd_id = rbd_dev->id;
2255 int max_id;
2256
2257 BUG_ON(rbd_id < 1);
Alex Elder499afd52012-02-02 08:13:29 -06002258
2259 spin_lock(&rbd_dev_list_lock);
2260 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002261
2262 /*
2263 * If the id being "put" is not the current maximum, there
2264 * is nothing special we need to do.
2265 */
2266 if (rbd_id != atomic64_read(&rbd_id_max)) {
2267 spin_unlock(&rbd_dev_list_lock);
2268 return;
2269 }
2270
2271 /*
2272 * We need to update the current maximum id. Search the
2273 * list to find out what it is. We're more likely to find
2274 * the maximum at the end, so search the list backward.
2275 */
2276 max_id = 0;
2277 list_for_each_prev(tmp, &rbd_dev_list) {
2278 struct rbd_device *rbd_dev;
2279
2280 rbd_dev = list_entry(tmp, struct rbd_device, node);
2281 if (rbd_id > max_id)
2282 max_id = rbd_id;
2283 }
Alex Elder499afd52012-02-02 08:13:29 -06002284 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002285
Alex Elder1ddbe942012-01-29 13:57:44 -06002286 /*
Alex Elderd184f6b2012-01-29 13:57:44 -06002287 * The max id could have been updated by rbd_id_get(), in
2288 * which case it now accurately reflects the new maximum.
2289 * Be careful not to overwrite the maximum value in that
2290 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002291 */
Alex Elderd184f6b2012-01-29 13:57:44 -06002292 atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
Alex Elderb7f23c32012-01-29 13:57:43 -06002293}
2294
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/* Characters isspace() treats as whitespace in the C/POSIX locale */
	static const char spaces[] = " \f\n\r\t\v";
	const char *p = *buf;

	p += strspn(p, spaces);		/* skip leading whitespace */
	*buf = p;

	return strcspn(p, spaces);	/* length of the token at p */
}
2313
2314/*
2315 * Finds the next token in *buf, and if the provided token buffer is
2316 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06002317 * copied, is guaranteed to be terminated with '\0'. Note that *buf
2318 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06002319 *
2320 * Returns the length of the token found (not including the '\0').
2321 * Return value will be 0 if no token is found, and it will be >=
2322 * token_size if the token would not fit.
2323 *
Alex Elder593a9e72012-02-07 12:03:37 -06002324 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06002325 * found token. Note that this occurs even if the token buffer is
2326 * too small to hold it.
2327 */
2328static inline size_t copy_token(const char **buf,
2329 char *token,
2330 size_t token_size)
2331{
2332 size_t len;
2333
2334 len = next_token(buf);
2335 if (len < token_size) {
2336 memcpy(token, *buf, len);
2337 *(token + len) = '\0';
2338 }
2339 *buf += len;
2340
2341 return len;
2342}
2343
2344/*
Alex Elderea3352f2012-07-09 21:04:23 -05002345 * Finds the next token in *buf, dynamically allocates a buffer big
2346 * enough to hold a copy of it, and copies the token into the new
2347 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2348 * that a duplicate buffer is created even for a zero-length token.
2349 *
2350 * Returns a pointer to the newly-allocated duplicate, or a null
2351 * pointer if memory for the duplicate was not available. If
2352 * the lenp argument is a non-null pointer, the length of the token
2353 * (not including the '\0') is returned in *lenp.
2354 *
2355 * If successful, the *buf pointer will be updated to point beyond
2356 * the end of the found token.
2357 *
2358 * Note: uses GFP_KERNEL for allocation.
2359 */
2360static inline char *dup_token(const char **buf, size_t *lenp)
2361{
2362 char *dup;
2363 size_t len;
2364
2365 len = next_token(buf);
2366 dup = kmalloc(len + 1, GFP_KERNEL);
2367 if (!dup)
2368 return NULL;
2369
2370 memcpy(dup, *buf, len);
2371 *(dup + len) = '\0';
2372 *buf += len;
2373
2374 if (lenp)
2375 *lenp = len;
2376
2377 return dup;
2378}
2379
2380/*
Alex Elder0bed54d2012-07-03 16:01:18 -05002381 * This fills in the pool_name, image_name, image_name_len, snap_name,
Alex Eldera725f65e2012-02-02 08:13:30 -06002382 * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
2383 * on the list of monitor addresses and other options provided via
2384 * /sys/bus/rbd/add.
Alex Elderd22f76e2012-07-12 10:46:35 -05002385 *
2386 * Note: rbd_dev is assumed to have been initially zero-filled.
Alex Eldera725f65e2012-02-02 08:13:30 -06002387 */
2388static int rbd_add_parse_args(struct rbd_device *rbd_dev,
2389 const char *buf,
Alex Elder7ef32142012-02-02 08:13:30 -06002390 const char **mon_addrs,
Alex Elder5214ecc2012-02-02 08:13:30 -06002391 size_t *mon_addrs_size,
Alex Eldere28fff262012-02-02 08:13:30 -06002392 char *options,
Alex Elder0bed54d2012-07-03 16:01:18 -05002393 size_t options_size)
Alex Eldera725f65e2012-02-02 08:13:30 -06002394{
Alex Elderd22f76e2012-07-12 10:46:35 -05002395 size_t len;
2396 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06002397
2398 /* The first four tokens are required */
2399
Alex Elder7ef32142012-02-02 08:13:30 -06002400 len = next_token(&buf);
2401 if (!len)
Alex Eldera725f65e2012-02-02 08:13:30 -06002402 return -EINVAL;
Alex Elder5214ecc2012-02-02 08:13:30 -06002403 *mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06002404 *mon_addrs = buf;
2405
2406 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06002407
Alex Eldere28fff262012-02-02 08:13:30 -06002408 len = copy_token(&buf, options, options_size);
2409 if (!len || len >= options_size)
2410 return -EINVAL;
Alex Eldera725f65e2012-02-02 08:13:30 -06002411
Alex Elderbf3e5ae2012-07-09 21:04:23 -05002412 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05002413 rbd_dev->pool_name = dup_token(&buf, NULL);
2414 if (!rbd_dev->pool_name)
Alex Elderd22f76e2012-07-12 10:46:35 -05002415 goto out_err;
Alex Eldere28fff262012-02-02 08:13:30 -06002416
Alex Elder0bed54d2012-07-03 16:01:18 -05002417 rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
2418 if (!rbd_dev->image_name)
Alex Elderbf3e5ae2012-07-09 21:04:23 -05002419 goto out_err;
Alex Eldere28fff262012-02-02 08:13:30 -06002420
Alex Eldercb8627c2012-07-09 21:04:23 -05002421 /* Create the name of the header object */
2422
Alex Elder0bed54d2012-07-03 16:01:18 -05002423 rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
Alex Elderbf3e5ae2012-07-09 21:04:23 -05002424 + sizeof (RBD_SUFFIX),
2425 GFP_KERNEL);
Alex Elder0bed54d2012-07-03 16:01:18 -05002426 if (!rbd_dev->header_name)
Alex Eldercb8627c2012-07-09 21:04:23 -05002427 goto out_err;
Alex Elder0bed54d2012-07-03 16:01:18 -05002428 sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
Alex Eldera725f65e2012-02-02 08:13:30 -06002429
Alex Eldere28fff262012-02-02 08:13:30 -06002430 /*
Alex Elder820a5f32012-07-09 21:04:24 -05002431 * The snapshot name is optional. If none is is supplied,
2432 * we use the default value.
Alex Eldere28fff262012-02-02 08:13:30 -06002433 */
Alex Elder820a5f32012-07-09 21:04:24 -05002434 rbd_dev->snap_name = dup_token(&buf, &len);
2435 if (!rbd_dev->snap_name)
2436 goto out_err;
2437 if (!len) {
2438 /* Replace the empty name with the default */
2439 kfree(rbd_dev->snap_name);
2440 rbd_dev->snap_name
2441 = kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
2442 if (!rbd_dev->snap_name)
2443 goto out_err;
2444
Alex Eldere28fff262012-02-02 08:13:30 -06002445 memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
2446 sizeof (RBD_SNAP_HEAD_NAME));
Alex Elder849b4262012-07-09 21:04:24 -05002447 }
Alex Eldere28fff262012-02-02 08:13:30 -06002448
Alex Eldera725f65e2012-02-02 08:13:30 -06002449 return 0;
Alex Elderd22f76e2012-07-12 10:46:35 -05002450
2451out_err:
Alex Elder0bed54d2012-07-03 16:01:18 -05002452 kfree(rbd_dev->header_name);
2453 kfree(rbd_dev->image_name);
Alex Elderd22f76e2012-07-12 10:46:35 -05002454 kfree(rbd_dev->pool_name);
2455 rbd_dev->pool_name = NULL;
2456
2457 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06002458}
2459
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002460static ssize_t rbd_add(struct bus_type *bus,
2461 const char *buf,
2462 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002463{
Alex Eldercb8627c2012-07-09 21:04:23 -05002464 char *options;
2465 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06002466 const char *mon_addrs = NULL;
2467 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06002468 struct ceph_osd_client *osdc;
2469 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002470
2471 if (!try_module_get(THIS_MODULE))
2472 return -ENODEV;
2473
Alex Elder27cc2592012-02-02 08:13:30 -06002474 options = kmalloc(count, GFP_KERNEL);
2475 if (!options)
2476 goto err_nomem;
Alex Eldercb8627c2012-07-09 21:04:23 -05002477 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2478 if (!rbd_dev)
2479 goto err_nomem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002480
2481 /* static rbd_device initialization */
2482 spin_lock_init(&rbd_dev->lock);
2483 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002484 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08002485 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002486
Josh Durginc6666012011-11-21 17:11:12 -08002487 init_rwsem(&rbd_dev->header_rwsem);
Alex Elder0e805a12012-01-11 19:42:15 -08002488
Alex Elderd184f6b2012-01-29 13:57:44 -06002489 /* generate unique id: find highest unique id, add one */
Alex Elder499afd52012-02-02 08:13:29 -06002490 rbd_id_get(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002491
Alex Eldera725f65e2012-02-02 08:13:30 -06002492 /* Fill in the device name, now that we have its id. */
Alex Elder81a89792012-02-02 08:13:30 -06002493 BUILD_BUG_ON(DEV_NAME_LEN
2494 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
2495 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->id);
Alex Eldere124a822012-01-29 13:57:44 -06002496
Alex Eldera725f65e2012-02-02 08:13:30 -06002497 /* parse add command */
Alex Elder7ef32142012-02-02 08:13:30 -06002498 rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
Alex Eldere28fff262012-02-02 08:13:30 -06002499 options, count);
Alex Eldera725f65e2012-02-02 08:13:30 -06002500 if (rc)
2501 goto err_put_id;
2502
Alex Elder5214ecc2012-02-02 08:13:30 -06002503 rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1,
2504 options);
Alex Elderd720bcb2012-02-02 08:13:30 -06002505 if (IS_ERR(rbd_dev->rbd_client)) {
2506 rc = PTR_ERR(rbd_dev->rbd_client);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002507 goto err_put_id;
Alex Elderd720bcb2012-02-02 08:13:30 -06002508 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002509
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002510 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06002511 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002512 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2513 if (rc < 0)
2514 goto err_out_client;
Alex Elder9bb2f332012-07-12 10:46:35 -05002515 rbd_dev->pool_id = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002516
2517 /* register our block device */
Alex Elder27cc2592012-02-02 08:13:30 -06002518 rc = register_blkdev(0, rbd_dev->name);
2519 if (rc < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002520 goto err_out_client;
Alex Elder27cc2592012-02-02 08:13:30 -06002521 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002522
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002523 rc = rbd_bus_add_dev(rbd_dev);
2524 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002525 goto err_out_blkdev;
2526
Alex Elder32eec682012-02-08 16:11:14 -06002527 /*
2528 * At this point cleanup in the event of an error is the job
2529 * of the sysfs code (initiated by rbd_bus_del_dev()).
2530 *
2531 * Set up and announce blkdev mapping.
2532 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002533 rc = rbd_init_disk(rbd_dev);
2534 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002535 goto err_out_bus;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002536
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002537 rc = rbd_init_watch_dev(rbd_dev);
2538 if (rc)
2539 goto err_out_bus;
2540
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002541 return count;
2542
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002543err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002544 /* this will also clean up rest of rbd_dev stuff */
2545
2546 rbd_bus_del_dev(rbd_dev);
2547 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002548 return rc;
2549
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002550err_out_blkdev:
2551 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2552err_out_client:
2553 rbd_put_client(rbd_dev);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002554err_put_id:
Alex Eldercb8627c2012-07-09 21:04:23 -05002555 if (rbd_dev->pool_name) {
Alex Elder820a5f32012-07-09 21:04:24 -05002556 kfree(rbd_dev->snap_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002557 kfree(rbd_dev->header_name);
2558 kfree(rbd_dev->image_name);
Alex Eldercb8627c2012-07-09 21:04:23 -05002559 kfree(rbd_dev->pool_name);
2560 }
Alex Elder499afd52012-02-02 08:13:29 -06002561 rbd_id_put(rbd_dev);
Alex Elder27cc2592012-02-02 08:13:30 -06002562err_nomem:
Alex Elder27cc2592012-02-02 08:13:30 -06002563 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05002564 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06002565
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002566 dout("Error adding device %s\n", buf);
2567 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06002568
2569 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002570}
2571
2572static struct rbd_device *__rbd_get_dev(unsigned long id)
2573{
2574 struct list_head *tmp;
2575 struct rbd_device *rbd_dev;
2576
Alex Eldere124a822012-01-29 13:57:44 -06002577 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002578 list_for_each(tmp, &rbd_dev_list) {
2579 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Eldere124a822012-01-29 13:57:44 -06002580 if (rbd_dev->id == id) {
2581 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002582 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06002583 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002584 }
Alex Eldere124a822012-01-29 13:57:44 -06002585 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002586 return NULL;
2587}
2588
/*
 * Device-model release callback for an rbd device, invoked by the
 * driver core once the last sysfs reference is dropped (after
 * rbd_bus_del_dev()).  Tears down the watch, the ceph client
 * reference, the disk/blkdev registration, the name strings, and the
 * device id, then frees the rbd_device and drops the module ref taken
 * in rbd_add().  The ordering below is deliberate: the watch must be
 * torn down while the client connection still exists.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Cancel the lingering watch request before unwatching. */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev, rbd_dev->header_name);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->snap_name);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
2619
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002620static ssize_t rbd_remove(struct bus_type *bus,
2621 const char *buf,
2622 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002623{
2624 struct rbd_device *rbd_dev = NULL;
2625 int target_id, rc;
2626 unsigned long ul;
2627 int ret = count;
2628
2629 rc = strict_strtoul(buf, 10, &ul);
2630 if (rc)
2631 return rc;
2632
2633 /* convert to int; abort if we lost anything in the conversion */
2634 target_id = (int) ul;
2635 if (target_id != ul)
2636 return -EINVAL;
2637
2638 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2639
2640 rbd_dev = __rbd_get_dev(target_id);
2641 if (!rbd_dev) {
2642 ret = -ENOENT;
2643 goto done;
2644 }
2645
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002646 __rbd_remove_all_snaps(rbd_dev);
2647 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002648
2649done:
2650 mutex_unlock(&ctl_mutex);
2651 return ret;
2652}
2653
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002654static ssize_t rbd_snap_add(struct device *dev,
2655 struct device_attribute *attr,
2656 const char *buf,
2657 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002658{
Alex Elder593a9e72012-02-07 12:03:37 -06002659 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002660 int ret;
2661 char *name = kmalloc(count + 1, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002662 if (!name)
2663 return -ENOMEM;
2664
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002665 snprintf(name, count, "%s", buf);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002666
2667 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2668
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002669 ret = rbd_header_add_snap(rbd_dev,
2670 name, GFP_KERNEL);
2671 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002672 goto err_unlock;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002673
Josh Durgin263c6ca2011-12-05 10:43:42 -08002674 ret = __rbd_refresh_header(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002675 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002676 goto err_unlock;
2677
2678 /* shouldn't hold ctl_mutex when notifying.. notify might
2679 trigger a watch callback that would need to get that mutex */
2680 mutex_unlock(&ctl_mutex);
2681
2682 /* make a best effort, don't error if failed */
Alex Elder0bed54d2012-07-03 16:01:18 -05002683 rbd_req_sync_notify(rbd_dev, rbd_dev->header_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002684
2685 ret = count;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002686 kfree(name);
2687 return ret;
2688
2689err_unlock:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002690 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002691 kfree(name);
2692 return ret;
2693}
2694
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002695/*
2696 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002697 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002698 */
2699static int rbd_sysfs_init(void)
2700{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002701 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002702
Alex Elderfed4c142012-02-07 12:03:36 -06002703 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06002704 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002705 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002706
Alex Elderfed4c142012-02-07 12:03:36 -06002707 ret = bus_register(&rbd_bus_type);
2708 if (ret < 0)
2709 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002710
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002711 return ret;
2712}
2713
/*
 * Tear down the sysfs state created by rbd_sysfs_init(), in reverse
 * order of registration: bus first, then the root device.
 */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
2719
2720int __init rbd_init(void)
2721{
2722 int rc;
2723
2724 rc = rbd_sysfs_init();
2725 if (rc)
2726 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002727 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002728 return 0;
2729}
2730
/* Module exit: remove the sysfs bus and root device. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2735
/* Module entry/exit hooks and metadata. */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");