blob: aaa19d8c367049f93c6ca90f684b3e32e770b2e7 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
44#define DRV_NAME "rbd"
45#define DRV_NAME_LONG "rbd (rados block device)"
46
47#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
48
Alex Elder21079782012-01-24 10:08:36 -060049#define RBD_MAX_MD_NAME_LEN (RBD_MAX_OBJ_NAME_LEN + sizeof(RBD_SUFFIX))
Yehuda Sadeh602adf42010-08-12 16:11:25 -070050#define RBD_MAX_POOL_NAME_LEN 64
51#define RBD_MAX_SNAP_NAME_LEN 32
52#define RBD_MAX_OPT_LEN 1024
53
54#define RBD_SNAP_HEAD_NAME "-"
55
56#define DEV_NAME_LEN 32
57
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070058#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
59
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	u64 image_size;		/* size of the (head) image, in bytes */
	char block_name[32];	/* name prefix for the image's data objects */
	__u8 obj_order;		/* log2 of the object (segment) size */
	__u8 crypt_type;	/* copied from the on-disk header options */
	__u8 comp_type;		/* copied from the on-disk header options */
	struct rw_semaphore snap_rwsem;	/* protects the snapshot state below */
	struct ceph_snap_context *snapc;	/* snap ids used for I/O */
	size_t snap_names_len;	/* total bytes in the snap_names buffer */
	u64 snap_seq;		/* snap_seq from the on-disk header */
	u32 total_snaps;	/* number of snapshots (== snapc->num_snaps) */

	char *snap_names;	/* '\0'-separated names, parallel to snapc->snaps */
	u64 *snap_sizes;	/* image size at each snapshot, parallel array */

	u64 obj_version;	/* osd-reported version of the header object */
};
80
/* rbd-specific mount options, filled in by parse_rbd_opts_token() */
struct rbd_options {
	int notify_timeout;	/* "notify_timeout=%d" option; defaults to
				 * RBD_NOTIFY_TIMEOUT_DEFAULT */
};
84
/*
 * an instance of the client. multiple devices may share a client.
 */
struct rbd_client {
	struct ceph_client *client;	/* libceph client (owns the options) */
	struct rbd_options *rbd_opts;	/* attached in rbd_client_create(),
					 * freed in rbd_client_release() */
	struct kref kref;		/* one reference per rbd_device user */
	struct list_head node;		/* entry in rbd_client_list */
};
94
struct rbd_req_coll;

/*
 * a single io request
 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;			/* byte length of this sub-request */
	int coll_index;			/* our slot in coll->status[] */
	struct rbd_req_coll *coll;	/* parent collection, or NULL */
};
108
/* completion state of one slot in a request collection */
struct rbd_req_status {
	int done;	/* nonzero once this sub-request has completed */
	int rc;		/* completion result code */
	u64 bytes;	/* byte count reported to the block layer */
};
114
/*
 * a collection of requests
 */
struct rbd_req_coll {
	int total;		/* number of sub-requests in the collection */
	int num_done;		/* length of completed-and-reported prefix */
	struct kref kref;	/* one reference per outstanding sub-request */
	struct rbd_req_status status[0];	/* trailing array, total entries */
};
124
/* in-memory record of one snapshot, exposed through sysfs */
struct rbd_snap {
	struct device dev;	/* sysfs representation */
	const char *name;	/* snapshot name */
	size_t size;		/* image size at snapshot time */
	struct list_head node;	/* entry in rbd_device->snaps */
	u64 id;			/* snapshot id */
};
132
/*
 * a single device
 */
struct rbd_device {
	int id;			/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;	/* blkdev's gendisk and rq */
	struct request_queue *q;

	struct rbd_client *rbd_client;	/* (possibly shared) ceph client */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;	/* queue lock */

	struct rbd_image_header header;	/* current image metadata */
	char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
	int obj_len;
	char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
	char pool_name[RBD_MAX_POOL_NAME_LEN];
	int poolid;

	struct ceph_osd_event *watch_event;	/* header update notifications */
	struct ceph_osd_request *watch_request;

	char snap_name[RBD_MAX_SNAP_NAME_LEN];	/* mapped snap, or "-" for head */
	u32 cur_snap;	/* index+1 of current snapshot within snap context
			   0 - for the head */
	int read_only;	/* nonzero when a snapshot is mapped */

	struct list_head node;	/* entry in rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
};
172
/* pseudo-bus all rbd devices hang off for sysfs purposes */
static struct bus_type rbd_bus_type = {
	.name = "rbd",
};

static DEFINE_SPINLOCK(node_lock); /* protects client get/put */

static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
static LIST_HEAD(rbd_dev_list); /* devices */
static LIST_HEAD(rbd_client_list); /* clients */

/* forward declarations for the snapshot/sysfs machinery defined later */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
static void rbd_dev_release(struct device *dev);
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count);
static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
				  struct rbd_snap *snap);
192
193static struct rbd_device *dev_to_rbd(struct device *dev)
194{
195 return container_of(dev, struct rbd_device, dev);
196}
197
198static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
199{
200 return get_device(&rbd_dev->dev);
201}
202
203static void rbd_put_dev(struct rbd_device *rbd_dev)
204{
205 put_device(&rbd_dev->dev);
206}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700207
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700208static int __rbd_update_snaps(struct rbd_device *rbd_dev);
209
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700210static int rbd_open(struct block_device *bdev, fmode_t mode)
211{
212 struct gendisk *disk = bdev->bd_disk;
213 struct rbd_device *rbd_dev = disk->private_data;
214
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800215 rbd_get_dev(rbd_dev);
216
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700217 set_device_ro(bdev, rbd_dev->read_only);
218
219 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
220 return -EROFS;
221
222 return 0;
223}
224
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800225static int rbd_release(struct gendisk *disk, fmode_t mode)
226{
227 struct rbd_device *rbd_dev = disk->private_data;
228
229 rbd_put_dev(rbd_dev);
230
231 return 0;
232}
233
/* block device entry points; all I/O flows through the request queue */
static const struct block_device_operations rbd_bd_ops = {
	.owner		= THIS_MODULE,
	.open		= rbd_open,
	.release	= rbd_release,
};
239
240/*
241 * Initialize an rbd client instance.
242 * We own *opt.
243 */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700244static struct rbd_client *rbd_client_create(struct ceph_options *opt,
245 struct rbd_options *rbd_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700246{
247 struct rbd_client *rbdc;
248 int ret = -ENOMEM;
249
250 dout("rbd_client_create\n");
251 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
252 if (!rbdc)
253 goto out_opt;
254
255 kref_init(&rbdc->kref);
256 INIT_LIST_HEAD(&rbdc->node);
257
Sage Weil6ab00d42011-08-09 09:41:59 -0700258 rbdc->client = ceph_create_client(opt, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700259 if (IS_ERR(rbdc->client))
260 goto out_rbdc;
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400261 opt = NULL; /* Now rbdc->client is responsible for opt */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700262
263 ret = ceph_open_session(rbdc->client);
264 if (ret < 0)
265 goto out_err;
266
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700267 rbdc->rbd_opts = rbd_opts;
268
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700269 spin_lock(&node_lock);
270 list_add_tail(&rbdc->node, &rbd_client_list);
271 spin_unlock(&node_lock);
272
273 dout("rbd_client_create created %p\n", rbdc);
274 return rbdc;
275
276out_err:
277 ceph_destroy_client(rbdc->client);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700278out_rbdc:
279 kfree(rbdc);
280out_opt:
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400281 if (opt)
282 ceph_destroy_options(opt);
283 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700284}
285
286/*
287 * Find a ceph client with specific addr and configuration.
288 */
289static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
290{
291 struct rbd_client *client_node;
292
293 if (opt->flags & CEPH_OPT_NOSHARE)
294 return NULL;
295
296 list_for_each_entry(client_node, &rbd_client_list, node)
297 if (ceph_compare_options(opt, client_node->client) == 0)
298 return client_node;
299 return NULL;
300}
301
/*
 * mount options
 */
enum {
	Opt_notify_timeout,	/* "notify_timeout=%d" */
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
};

/* token table consumed by match_token() in parse_rbd_opts_token() */
static match_table_t rbdopt_tokens = {
	{Opt_notify_timeout, "notify_timeout=%d"},
	/* int args above */
	/* string args above */
	{-1, NULL}
};
319
320static int parse_rbd_opts_token(char *c, void *private)
321{
322 struct rbd_options *rbdopt = private;
323 substring_t argstr[MAX_OPT_ARGS];
324 int token, intval, ret;
325
Alex Elder21079782012-01-24 10:08:36 -0600326 token = match_token(c, rbdopt_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700327 if (token < 0)
328 return -EINVAL;
329
330 if (token < Opt_last_int) {
331 ret = match_int(&argstr[0], &intval);
332 if (ret < 0) {
333 pr_err("bad mount option arg (not int) "
334 "at '%s'\n", c);
335 return ret;
336 }
337 dout("got int token %d val %d\n", token, intval);
338 } else if (token > Opt_last_int && token < Opt_last_string) {
339 dout("got string token %d val %s\n", token,
340 argstr[0].from);
341 } else {
342 dout("got token %d\n", token);
343 }
344
345 switch (token) {
346 case Opt_notify_timeout:
347 rbdopt->notify_timeout = intval;
348 break;
349 default:
350 BUG_ON(token);
351 }
352 return 0;
353}
354
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.
 *
 * On success rbd_dev->rbd_client is set (either a shared, existing
 * client with an extra kref, or a freshly created one).  Ownership of
 * the parsed ceph options and of rbd_opts is transferred or released
 * on all paths.
 */
static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
			  char *options)
{
	struct rbd_client *rbdc;
	struct ceph_options *opt;
	int ret;
	struct rbd_options *rbd_opts;

	rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		return -ENOMEM;

	rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;

	/* unrecognized options are routed to parse_rbd_opts_token() */
	opt = ceph_parse_options(options, mon_addr,
				 mon_addr + strlen(mon_addr),
				 parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(opt)) {
		ret = PTR_ERR(opt);
		goto done_err;
	}

	spin_lock(&node_lock);
	rbdc = __rbd_client_find(opt);
	if (rbdc) {
		/* match found: drop our copies and share the client */
		ceph_destroy_options(opt);
		kfree(rbd_opts);

		/* using an existing client */
		kref_get(&rbdc->kref);
		rbd_dev->rbd_client = rbdc;
		spin_unlock(&node_lock);
		return 0;
	}
	spin_unlock(&node_lock);

	/*
	 * NOTE(review): node_lock is dropped before rbd_client_create()
	 * re-takes it to add the new client, so two racing mounts could
	 * each create a client instead of sharing one -- looks benign,
	 * but verify.
	 */
	rbdc = rbd_client_create(opt, rbd_opts);
	if (IS_ERR(rbdc)) {
		ret = PTR_ERR(rbdc);
		goto done_err;
	}

	rbd_dev->rbd_client = rbdc;
	return 0;
done_err:
	kfree(rbd_opts);
	return ret;
}
407
/*
 * Destroy ceph client
 *
 * Caller must hold node_lock.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	/* unlink from rbd_client_list; node_lock held by the caller */
	list_del(&rbdc->node);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc->rbd_opts);	/* ownership was taken in rbd_client_create() */
	kfree(rbdc);
}
424
/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_device *rbd_dev)
{
	/* rbd_client_release() requires node_lock to be held */
	spin_lock(&node_lock);
	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
	spin_unlock(&node_lock);
	rbd_dev->rbd_client = NULL;
}
436
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700437/*
438 * Destroy requests collection
439 */
440static void rbd_coll_release(struct kref *kref)
441{
442 struct rbd_req_coll *coll =
443 container_of(kref, struct rbd_req_coll, kref);
444
445 dout("rbd_coll_release %p\n", coll);
446 kfree(coll);
447}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700448
/*
 * Create a new header structure, translate header format from the on-disk
 * header.
 *
 * @header:          in-memory header to fill in
 * @ondisk:          raw header as read from the header object
 * @allocated_snaps: number of snapshot slots the caller's read of
 *                   @ondisk had room for; snapshot ids/sizes/names are
 *                   copied only when this matches the on-disk count
 * @gfp_flags:       allocation flags for the snap context
 *
 * Returns 0, -ENXIO on bad magic, or -ENOMEM.
 */
static int rbd_header_from_disk(struct rbd_image_header *header,
				struct rbd_image_header_ondisk *ondisk,
				int allocated_snaps,
				gfp_t gfp_flags)
{
	int i;
	u32 snap_count = le32_to_cpu(ondisk->snap_count);
	int ret = -ENOMEM;

	/* reject anything that doesn't start with the rbd header magic */
	if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT)))
		return -ENXIO;

	init_rwsem(&header->snap_rwsem);
	header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
	/*
	 * NOTE(review): snap_count and snap_names_len come straight off
	 * disk and the size arithmetic below is unchecked (possible
	 * integer overflow on a corrupt header); also each snap slot is
	 * sized as sizeof(*ondisk) although snapc->snaps[] holds u64
	 * ids -- looks like a (safe) over-allocation, verify against
	 * struct ceph_snap_context.
	 */
	header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
				snap_count * sizeof (*ondisk),
				gfp_flags);
	if (!header->snapc)
		return -ENOMEM;
	if (snap_count) {
		header->snap_names = kmalloc(header->snap_names_len,
					     GFP_KERNEL);
		if (!header->snap_names)
			goto err_snapc;
		header->snap_sizes = kmalloc(snap_count * sizeof(u64),
					     GFP_KERNEL);
		if (!header->snap_sizes)
			goto err_names;
	} else {
		header->snap_names = NULL;
		header->snap_sizes = NULL;
	}
	memcpy(header->block_name, ondisk->block_name,
	       sizeof(ondisk->block_name));

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->obj_order = ondisk->options.order;
	header->crypt_type = ondisk->options.crypt_type;
	header->comp_type = ondisk->options.comp_type;

	atomic_set(&header->snapc->nref, 1);
	header->snap_seq = le64_to_cpu(ondisk->snap_seq);
	header->snapc->num_snaps = snap_count;
	header->total_snaps = snap_count;

	/* copy snapshot details only if the caller's read covered them all */
	if (snap_count && allocated_snaps == snap_count) {
		for (i = 0; i < snap_count; i++) {
			header->snapc->snaps[i] =
				le64_to_cpu(ondisk->snaps[i].id);
			header->snap_sizes[i] =
				le64_to_cpu(ondisk->snaps[i].image_size);
		}

		/* copy snapshot names */
		memcpy(header->snap_names, &ondisk->snaps[i],
		       header->snap_names_len);
	}

	return 0;

err_names:
	kfree(header->snap_names);
err_snapc:
	kfree(header->snapc);
	return ret;
}
519
520static int snap_index(struct rbd_image_header *header, int snap_num)
521{
522 return header->total_snaps - snap_num;
523}
524
525static u64 cur_snap_id(struct rbd_device *rbd_dev)
526{
527 struct rbd_image_header *header = &rbd_dev->header;
528
529 if (!rbd_dev->cur_snap)
530 return 0;
531
532 return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)];
533}
534
535static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
536 u64 *seq, u64 *size)
537{
538 int i;
539 char *p = header->snap_names;
540
541 for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
542 if (strcmp(snap_name, p) == 0)
543 break;
544 }
545 if (i == header->total_snaps)
546 return -ENOENT;
547 if (seq)
548 *seq = header->snapc->snaps[i];
549
550 if (size)
551 *size = header->snap_sizes[i];
552
553 return i;
554}
555
/*
 * Point the device at the snapshot named in dev->snap_name -- or at
 * the head when it holds RBD_SNAP_HEAD_NAME -- updating snapc->seq,
 * cur_snap and read_only accordingly.  *size (if non-NULL) receives
 * the image size at that point in time.  Returns 0, or -ENOENT if no
 * snapshot has that name.
 */
static int rbd_header_set_snap(struct rbd_device *dev, u64 *size)
{
	struct rbd_image_header *header = &dev->header;
	struct ceph_snap_context *snapc = header->snapc;
	int ret = -ENOENT;

	BUILD_BUG_ON(sizeof (dev->snap_name) < sizeof (RBD_SNAP_HEAD_NAME));

	down_write(&header->snap_rwsem);

	if (!memcmp(dev->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		/* mapping the head: writable, newest snap sequence */
		if (header->total_snaps)
			snapc->seq = header->snap_seq;
		else
			snapc->seq = 0;
		dev->cur_snap = 0;
		dev->read_only = 0;
		if (size)
			*size = header->image_size;
	} else {
		ret = snap_by_name(header, dev->snap_name, &snapc->seq, size);
		if (ret < 0)
			goto done;

		/* cur_snap is index+1 counted from the end of the array */
		dev->cur_snap = header->total_snaps - ret;
		dev->read_only = 1;	/* snapshots are immutable */
	}

	ret = 0;
done:
	up_write(&header->snap_rwsem);
	return ret;
}
590
591static void rbd_header_free(struct rbd_image_header *header)
592{
593 kfree(header->snapc);
594 kfree(header->snap_names);
595 kfree(header->snap_sizes);
596}
597
598/*
599 * get the actual striped segment name, offset and length
600 */
601static u64 rbd_get_segment(struct rbd_image_header *header,
602 const char *block_name,
603 u64 ofs, u64 len,
604 char *seg_name, u64 *segofs)
605{
606 u64 seg = ofs >> header->obj_order;
607
608 if (seg_name)
609 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
610 "%s.%012llx", block_name, seg);
611
612 ofs = ofs & ((1 << header->obj_order) - 1);
613 len = min_t(u64, len, (1 << header->obj_order) - ofs);
614
615 if (segofs)
616 *segofs = ofs;
617
618 return len;
619}
620
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700621static int rbd_get_num_segments(struct rbd_image_header *header,
622 u64 ofs, u64 len)
623{
624 u64 start_seg = ofs >> header->obj_order;
625 u64 end_seg = (ofs + len - 1) >> header->obj_order;
626 return end_seg - start_seg + 1;
627}
628
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700629/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700630 * returns the size of an object in the image
631 */
632static u64 rbd_obj_bytes(struct rbd_image_header *header)
633{
634 return 1 << header->obj_order;
635}
636
637/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700638 * bio helpers
639 */
640
641static void bio_chain_put(struct bio *chain)
642{
643 struct bio *tmp;
644
645 while (chain) {
646 tmp = chain;
647 chain = chain->bi_next;
648 bio_put(tmp);
649 }
650}
651
652/*
653 * zeros a bio chain, starting at specific offset
654 */
655static void zero_bio_chain(struct bio *chain, int start_ofs)
656{
657 struct bio_vec *bv;
658 unsigned long flags;
659 void *buf;
660 int i;
661 int pos = 0;
662
663 while (chain) {
664 bio_for_each_segment(bv, chain, i) {
665 if (pos + bv->bv_len > start_ofs) {
666 int remainder = max(start_ofs - pos, 0);
667 buf = bvec_kmap_irq(bv, &flags);
668 memset(buf + remainder, 0,
669 bv->bv_len - remainder);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +0200670 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700671 }
672 pos += bv->bv_len;
673 }
674
675 chain = chain->bi_next;
676 }
677}
678
679/*
680 * bio_chain_clone - clone a chain of bios up to a certain length.
681 * might return a bio_pair that will need to be released.
682 */
683static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
684 struct bio_pair **bp,
685 int len, gfp_t gfpmask)
686{
687 struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
688 int total = 0;
689
690 if (*bp) {
691 bio_pair_release(*bp);
692 *bp = NULL;
693 }
694
695 while (old_chain && (total < len)) {
696 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
697 if (!tmp)
698 goto err_out;
699
700 if (total + old_chain->bi_size > len) {
701 struct bio_pair *bp;
702
703 /*
704 * this split can only happen with a single paged bio,
705 * split_bio will BUG_ON if this is not the case
706 */
707 dout("bio_chain_clone split! total=%d remaining=%d"
708 "bi_size=%d\n",
709 (int)total, (int)len-total,
710 (int)old_chain->bi_size);
711
712 /* split the bio. We'll release it either in the next
713 call, or it will have to be released outside */
714 bp = bio_split(old_chain, (len - total) / 512ULL);
715 if (!bp)
716 goto err_out;
717
718 __bio_clone(tmp, &bp->bio1);
719
720 *next = &bp->bio2;
721 } else {
722 __bio_clone(tmp, old_chain);
723 *next = old_chain->bi_next;
724 }
725
726 tmp->bi_bdev = NULL;
727 gfpmask &= ~__GFP_WAIT;
728 tmp->bi_next = NULL;
729
730 if (!new_chain) {
731 new_chain = tail = tmp;
732 } else {
733 tail->bi_next = tmp;
734 tail = tmp;
735 }
736 old_chain = old_chain->bi_next;
737
738 total += tmp->bi_size;
739 }
740
741 BUG_ON(total < len);
742
743 if (tail)
744 tail->bi_next = NULL;
745
746 *old = old_chain;
747
748 return new_chain;
749
750err_out:
751 dout("bio_chain_clone with err\n");
752 bio_chain_put(new_chain);
753 return NULL;
754}
755
756/*
757 * helpers for osd request op vectors.
758 */
759static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
760 int num_ops,
761 int opcode,
762 u32 payload_len)
763{
764 *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
765 GFP_NOIO);
766 if (!*ops)
767 return -ENOMEM;
768 (*ops)[0].op = opcode;
769 /*
770 * op extent offset and length will be set later on
771 * in calc_raw_layout()
772 */
773 (*ops)[0].payload_len = payload_len;
774 return 0;
775}
776
/* Free an op vector allocated by rbd_create_rw_ops(). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
781
/*
 * Complete the @index'th sub-request of a collection, then report to
 * the block layer as much of the original request as is contiguously
 * finished.  Sub-requests may complete out of order, but blk request
 * completion must be fed in order, so results are parked in
 * coll->status[] and only the consecutive-done prefix starting at
 * num_done is flushed here.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n",
	     coll, index, ret, len);

	if (!rq)
		return;

	if (!coll) {
		/* not part of a collection: complete directly */
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	/* queue_lock serializes all access to the collection state */
	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	max = min = coll->num_done;
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i < max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		/* one collection ref is held per outstanding sub-request */
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
819
820static void rbd_coll_end_req(struct rbd_request *req,
821 int ret, u64 len)
822{
823 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
824}
825
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700826/*
827 * Send ceph osd request
828 */
829static int rbd_do_request(struct request *rq,
830 struct rbd_device *dev,
831 struct ceph_snap_context *snapc,
832 u64 snapid,
833 const char *obj, u64 ofs, u64 len,
834 struct bio *bio,
835 struct page **pages,
836 int num_pages,
837 int flags,
838 struct ceph_osd_req_op *ops,
839 int num_reply,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700840 struct rbd_req_coll *coll,
841 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700842 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700843 struct ceph_msg *msg),
844 struct ceph_osd_request **linger_req,
845 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700846{
847 struct ceph_osd_request *req;
848 struct ceph_file_layout *layout;
849 int ret;
850 u64 bno;
851 struct timespec mtime = CURRENT_TIME;
852 struct rbd_request *req_data;
853 struct ceph_osd_request_head *reqhead;
854 struct rbd_image_header *header = &dev->header;
Alex Elder1dbb4392012-01-24 10:08:37 -0600855 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700856
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700857 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700858 if (!req_data) {
859 if (coll)
860 rbd_coll_end_req_index(rq, coll, coll_index,
861 -ENOMEM, len);
862 return -ENOMEM;
863 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700864
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700865 if (coll) {
866 req_data->coll = coll;
867 req_data->coll_index = coll_index;
868 }
869
870 dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700871
872 down_read(&header->snap_rwsem);
873
Alex Elder1dbb4392012-01-24 10:08:37 -0600874 osdc = &dev->rbd_client->client->osdc;
875 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
876 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -0700877 if (!req) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700878 up_read(&header->snap_rwsem);
Sage Weil4ad12622011-05-03 09:23:36 -0700879 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700880 goto done_pages;
881 }
882
883 req->r_callback = rbd_cb;
884
885 req_data->rq = rq;
886 req_data->bio = bio;
887 req_data->pages = pages;
888 req_data->len = len;
889
890 req->r_priv = req_data;
891
892 reqhead = req->r_request->front.iov_base;
893 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
894
895 strncpy(req->r_oid, obj, sizeof(req->r_oid));
896 req->r_oid_len = strlen(req->r_oid);
897
898 layout = &req->r_file_layout;
899 memset(layout, 0, sizeof(*layout));
900 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
901 layout->fl_stripe_count = cpu_to_le32(1);
902 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
903 layout->fl_pg_preferred = cpu_to_le32(-1);
904 layout->fl_pg_pool = cpu_to_le32(dev->poolid);
Alex Elder1dbb4392012-01-24 10:08:37 -0600905 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
906 req, ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700907
908 ceph_osdc_build_request(req, ofs, &len,
909 ops,
910 snapc,
911 &mtime,
912 req->r_oid, req->r_oid_len);
913 up_read(&header->snap_rwsem);
914
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700915 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600916 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700917 *linger_req = req;
918 }
919
Alex Elder1dbb4392012-01-24 10:08:37 -0600920 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700921 if (ret < 0)
922 goto done_err;
923
924 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600925 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700926 if (ver)
927 *ver = le64_to_cpu(req->r_reassert_version.version);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700928 dout("reassert_ver=%lld\n",
929 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700930 ceph_osdc_put_request(req);
931 }
932 return ret;
933
934done_err:
935 bio_chain_put(req_data->bio);
936 ceph_osdc_put_request(req);
937done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700938 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700939 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700940 return ret;
941}
942
/*
 * Ceph osd op callback
 *
 * Completion path for asynchronous requests: parses the reply, fixes
 * up reads of holes (-ENOENT) and short reads by zero-filling the bio,
 * completes the collection slot and frees the request.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);	/* ops follow the reply head */
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);

	if (rc == -ENOENT && read_op) {
		/* read of a nonexistent object (hole): all zeroes */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero the tail, report the full length */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
981
/* Minimal completion callback: just drop the request reference. */
static void rbd_simple_req_cb(struct ceph_osd_request *osd_req,
			      struct ceph_msg *msg)
{
	ceph_osdc_put_request(osd_req);
}
986
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700987/*
988 * Do a synchronous ceph osd operation
989 */
990static int rbd_req_sync_op(struct rbd_device *dev,
991 struct ceph_snap_context *snapc,
992 u64 snapid,
993 int opcode,
994 int flags,
995 struct ceph_osd_req_op *orig_ops,
996 int num_reply,
997 const char *obj,
998 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700999 char *buf,
1000 struct ceph_osd_request **linger_req,
1001 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001002{
1003 int ret;
1004 struct page **pages;
1005 int num_pages;
1006 struct ceph_osd_req_op *ops = orig_ops;
1007 u32 payload_len;
1008
1009 num_pages = calc_pages_for(ofs , len);
1010 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
Dan Carpenterb8d06382010-10-11 21:14:23 +02001011 if (IS_ERR(pages))
1012 return PTR_ERR(pages);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001013
1014 if (!orig_ops) {
1015 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
1016 ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
1017 if (ret < 0)
1018 goto done;
1019
1020 if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
1021 ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
1022 if (ret < 0)
1023 goto done_ops;
1024 }
1025 }
1026
1027 ret = rbd_do_request(NULL, dev, snapc, snapid,
1028 obj, ofs, len, NULL,
1029 pages, num_pages,
1030 flags,
1031 ops,
1032 2,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001033 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001034 NULL,
1035 linger_req, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001036 if (ret < 0)
1037 goto done_ops;
1038
1039 if ((flags & CEPH_OSD_FLAG_READ) && buf)
1040 ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
1041
1042done_ops:
1043 if (!orig_ops)
1044 rbd_destroy_ops(ops);
1045done:
1046 ceph_release_page_vector(pages, num_pages);
1047 return ret;
1048}
1049
/*
 * Do an asynchronous ceph osd operation
 *
 * Maps the (@ofs, @len) range of the image onto a single object
 * segment and issues one osd request for it; completion is reported
 * through rbd_req_cb into the request collection (@coll, @coll_index).
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 snapid,
		     int opcode, int flags, int num_reply,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;

	/* GFP_NOIO: we are on the block I/O path, must not recurse into I/O */
	seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
	if (!seg_name)
		return -ENOMEM;

	seg_len = rbd_get_segment(&rbd_dev->header,
				  rbd_dev->header.block_name,
				  ofs, len,
				  seg_name, &seg_ofs);

	/* writes carry the data as payload; reads have no outgoing payload */
	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);

	ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
	if (ret < 0)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	BUG_ON(seg_len < len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     num_reply,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1105
/*
 * Request async osd write (thin wrapper around rbd_do_op)
 */
static int rbd_req_write(struct request *rq,
			 struct rbd_device *rbd_dev,
			 struct ceph_snap_context *snapc,
			 u64 ofs, u64 len,
			 struct bio *bio,
			 struct rbd_req_coll *coll,
			 int coll_index)
{
	/* writes always target the head (CEPH_NOSNAP) under @snapc */
	return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
			 CEPH_OSD_OP_WRITE,
			 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			 2,
			 ofs, len, bio, coll, coll_index);
}
1123
/*
 * Request async osd read (thin wrapper around rbd_do_op)
 */
static int rbd_req_read(struct request *rq,
			struct rbd_device *rbd_dev,
			u64 snapid,
			u64 ofs, u64 len,
			struct bio *bio,
			struct rbd_req_coll *coll,
			int coll_index)
{
	/* snapid == 0 means read the head */
	return rbd_do_op(rq, rbd_dev, NULL,
			 (snapid ? snapid : CEPH_NOSNAP),
			 CEPH_OSD_OP_READ,
			 CEPH_OSD_FLAG_READ,
			 2,
			 ofs, len, bio, coll, coll_index);
}
1142
/*
 * Request sync osd read of (@ofs, @len) from @obj into @buf.
 *
 * NOTE: @snapc is currently unused — NULL is always passed down to
 * rbd_req_sync_op().
 */
static int rbd_req_sync_read(struct rbd_device *dev,
			     struct ceph_snap_context *snapc,
			     u64 snapid,
			     const char *obj,
			     u64 ofs, u64 len,
			     char *buf,
			     u64 *ver)
{
	return rbd_req_sync_op(dev, NULL,
			       (snapid ? snapid : CEPH_NOSNAP),
			       CEPH_OSD_OP_READ,
			       CEPH_OSD_FLAG_READ,
			       NULL,
			       1, obj, ofs, len, buf, NULL, ver);
}
1161
/*
 * Ack a notification received on @obj.  Fire-and-forget: the
 * completion callback (rbd_simple_req_cb) only drops the request.
 *
 * NOTE: @ver is unused; the current header obj_version is sent instead.
 */
static int rbd_req_sync_notify_ack(struct rbd_device *dev,
				   u64 ver,
				   u64 notify_id,
				   const char *obj)
{
	struct ceph_osd_req_op *ops;
	struct page **pages = NULL;
	int ret;

	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (ret < 0)
		return ret;

	ops[0].watch.ver = cpu_to_le64(dev->header.obj_version);
	/*
	 * NOTE(review): @notify_id is stored without a cpu_to_le64
	 * conversion while ver gets one — confirm it already arrives
	 * little-endian from the osd client.
	 */
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, dev, NULL, CEPH_NOSNAP,
			     obj, 0, 0, NULL,
			     pages, 0,
			     CEPH_OSD_FLAG_READ,
			     ops,
			     1,
			     NULL, 0,
			     rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1194
/*
 * Callback for the header-object watch: refresh the header/snapshot
 * state under ctl_mutex, then ack the notification.
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *dev = (struct rbd_device *)data;
	int rc;

	if (!dev)
		return;

	dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
	     notify_id, (int)opcode);
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	rc = __rbd_update_snaps(dev);
	mutex_unlock(&ctl_mutex);
	if (rc)
		pr_warning(DRV_NAME "%d got notification but failed to update"
			   " snaps: %d\n", dev->major, rc);

	/* ack regardless of the refresh result */
	rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name);
}
1214
/*
 * Request sync osd watch: register a lingering watch on @obj so that
 * rbd_watch_cb() runs when the header object changes.  On success the
 * linger request is kept in dev->watch_request and the event handle
 * in dev->watch_event.
 */
static int rbd_req_sync_watch(struct rbd_device *dev,
			      const char *obj,
			      u64 ver)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc;

	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
	if (ret < 0)
		return ret;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)dev, &dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(ver);
	ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
	/* flag = 1 here vs. 0 in rbd_req_sync_unwatch() */
	ops[0].watch.flag = 1;

	ret = rbd_req_sync_op(dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      1, obj, 0, 0, NULL,
			      &dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(dev->watch_event);
	dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1259
/*
 * Request sync osd unwatch: unregister the watch set up by
 * rbd_req_sync_watch() and release the associated event.
 */
static int rbd_req_sync_unwatch(struct rbd_device *dev,
				const char *obj)
{
	struct ceph_osd_req_op *ops;

	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
	if (ret < 0)
		return ret;

	/* same WATCH op, but with flag cleared to unregister */
	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
	ops[0].watch.flag = 0;

	ret = rbd_req_sync_op(dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      1, obj, 0, 0, NULL, NULL, NULL);

	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(dev->watch_event);
	dev->watch_event = NULL;
	return ret;
}
1288
/* context handed to rbd_notify_cb(); currently only carries the device */
struct rbd_notify_info {
	struct rbd_device *dev;
};
1292
1293static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1294{
1295 struct rbd_device *dev = (struct rbd_device *)data;
1296 if (!dev)
1297 return;
1298
1299 dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
1300 notify_id, (int)opcode);
1301}
1302
/*
 * Request sync osd notify: post a notification on @obj (so watchers
 * re-read the header) and wait for it to complete.
 */
static int rbd_req_sync_notify(struct rbd_device *dev,
			       const char *obj)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc;
	struct ceph_osd_event *event;
	struct rbd_notify_info info;
	/* payload: two 32-bit values (version + timeout) */
	int payload_len = sizeof(u32) + sizeof(u32);
	int ret;

	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
	if (ret < 0)
		return ret;

	info.dev = dev;

	/* one-shot event, completed from rbd_notify_cb */
	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
				     (void *)&info, &event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = 1;
	ops[0].watch.flag = 1;
	ops[0].watch.cookie = event->cookie;
	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
	ops[0].watch.timeout = 12;

	ret = rbd_req_sync_op(dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      1, obj, 0, 0, NULL, NULL, NULL);
	if (ret < 0)
		goto fail_event;

	/*
	 * NOTE(review): the wait result is logged but discarded, and the
	 * event is only cancelled on the error path — presumably a
	 * one-shot event is reaped by the osd client after it fires;
	 * confirm the timeout case does not leak it.
	 */
	ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
	dout("ceph_osdc_wait_event returned %d\n", ret);
	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(event);
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1353
/*
 * Synchronously execute a method of an rbd object class on @obj
 * (CEPH_OSD_OP_CALL), passing @data/@len as the method's input.
 * (The previous comment incorrectly described this as a sync read.)
 */
static int rbd_req_sync_exec(struct rbd_device *dev,
			     const char *obj,
			     const char *cls,
			     const char *method,
			     const char *data,
			     int len,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int cls_len = strlen(cls);
	int method_len = strlen(method);
	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
				    cls_len + method_len + len);
	if (ret < 0)
		return ret;

	ops[0].cls.class_name = cls;
	ops[0].cls.class_len = (__u8)cls_len;
	ops[0].cls.method_name = method;
	ops[0].cls.method_len = (__u8)method_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = data;
	ops[0].cls.indata_len = len;

	ret = rbd_req_sync_op(dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      1, obj, 0, 0, NULL, NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1393
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001394static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1395{
1396 struct rbd_req_coll *coll =
1397 kzalloc(sizeof(struct rbd_req_coll) +
1398 sizeof(struct rbd_req_status) * num_reqs,
1399 GFP_ATOMIC);
1400
1401 if (!coll)
1402 return NULL;
1403 coll->total = num_reqs;
1404 kref_init(&coll->kref);
1405 return coll;
1406}
1407
/*
 * block device queue callback
 *
 * Drains the request queue: each request is split along object
 * (segment) boundaries and one async osd op is issued per segment.
 * Per-request completion is aggregated through an rbd_req_coll.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;
	struct bio_pair *bp = NULL;

	rq = blk_fetch_request(q);

	while (1) {
		struct bio *bio;
		struct bio *rq_bio, *next_bio = NULL;
		bool do_write;
		int size, op_size = 0;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;

		/* peek at request from block layer */
		if (!rq)
			break;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			goto next;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * 512ULL;
		rq_bio = rq->bio;
		if (do_write && rbd_dev->read_only) {
			__blk_end_request_all(rq, -EROFS);
			goto next;
		}

		/* drop the queue lock while we issue the osd requests */
		spin_unlock_irq(q->queue_lock);

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, blk_rq_pos(rq) * 512ULL);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			goto next;
		}

		do {
			/* a bio clone to be passed down to OSD req */
			dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
			op_size = rbd_get_segment(&rbd_dev->header,
						  rbd_dev->header.block_name,
						  ofs, size,
						  NULL, NULL);
			/* one coll reference per in-flight segment */
			kref_get(&coll->kref);
			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
					      op_size, GFP_ATOMIC);
			if (!bio) {
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, op_size);
				goto next_seg;
			}


			/* init OSD command: write or read */
			if (do_write)
				rbd_req_write(rq, rbd_dev,
					      rbd_dev->header.snapc,
					      ofs,
					      op_size, bio,
					      coll, cur_seg);
			else
				rbd_req_read(rq, rbd_dev,
					     cur_snap_id(rbd_dev),
					     ofs,
					     op_size, bio,
					     coll, cur_seg);

next_seg:
			size -= op_size;
			ofs += op_size;

			cur_seg++;
			rq_bio = next_bio;
		} while (size > 0);
		/* drop the initial reference taken by rbd_alloc_coll() */
		kref_put(&coll->kref, rbd_coll_release);

		if (bp)
			bio_pair_release(bp);
		spin_lock_irq(q->queue_lock);
next:
		rq = blk_fetch_request(q);
	}
}
1512
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	/* object size in sectors; obj_order is log2 of the object size */
	unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9);
	sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
	unsigned int bio_sectors = bmd->bi_size >> 9;
	int max;

	/* bytes left in the current object once this bio is appended */
	max = (chunk_sectors - ((sector & (chunk_sectors - 1))
				 + bio_sectors)) << 9;
	if (max < 0)
		max = 0; /* bio_add cannot handle a negative return */
	if (max <= bvec->bv_len && bio_sectors == 0)
		return bvec->bv_len;
	return max;
}
1535
/*
 * Tear down the gendisk and its queue, and free the in-core image
 * header.  Safe to call when no disk was ever allocated.
 */
static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk = rbd_dev->disk;

	if (!disk)
		return;

	rbd_header_free(&rbd_dev->header);

	/* only remove from sysfs if add_disk() actually ran */
	if (disk->flags & GENHD_FL_UP)
		del_gendisk(disk);
	if (disk->queue)
		blk_cleanup_queue(disk->queue);
	put_disk(disk);
}
1551
1552/*
1553 * reload the ondisk the header
1554 */
1555static int rbd_read_header(struct rbd_device *rbd_dev,
1556 struct rbd_image_header *header)
1557{
1558 ssize_t rc;
1559 struct rbd_image_header_ondisk *dh;
1560 int snap_count = 0;
1561 u64 snap_names_len = 0;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001562 u64 ver;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001563
1564 while (1) {
1565 int len = sizeof(*dh) +
1566 snap_count * sizeof(struct rbd_image_snap_ondisk) +
1567 snap_names_len;
1568
1569 rc = -ENOMEM;
1570 dh = kmalloc(len, GFP_KERNEL);
1571 if (!dh)
1572 return -ENOMEM;
1573
1574 rc = rbd_req_sync_read(rbd_dev,
1575 NULL, CEPH_NOSNAP,
1576 rbd_dev->obj_md_name,
1577 0, len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001578 (char *)dh, &ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001579 if (rc < 0)
1580 goto out_dh;
1581
1582 rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
Josh Durgin81e759f2011-11-15 14:49:53 -08001583 if (rc < 0) {
1584 if (rc == -ENXIO) {
1585 pr_warning("unrecognized header format"
1586 " for image %s", rbd_dev->obj);
1587 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001588 goto out_dh;
Josh Durgin81e759f2011-11-15 14:49:53 -08001589 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001590
1591 if (snap_count != header->total_snaps) {
1592 snap_count = header->total_snaps;
1593 snap_names_len = header->snap_names_len;
1594 rbd_header_free(header);
1595 kfree(dh);
1596 continue;
1597 }
1598 break;
1599 }
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001600 header->obj_version = ver;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001601
1602out_dh:
1603 kfree(dh);
1604 return rc;
1605}
1606
1607/*
1608 * create a snapshot
1609 */
1610static int rbd_header_add_snap(struct rbd_device *dev,
1611 const char *snap_name,
1612 gfp_t gfp_flags)
1613{
1614 int name_len = strlen(snap_name);
1615 u64 new_snapid;
1616 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001617 void *data, *p, *e;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001618 u64 ver;
Alex Elder1dbb4392012-01-24 10:08:37 -06001619 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001620
1621 /* we should create a snapshot only if we're pointing at the head */
1622 if (dev->cur_snap)
1623 return -EINVAL;
1624
Alex Elder1dbb4392012-01-24 10:08:37 -06001625 monc = &dev->rbd_client->client->monc;
1626 ret = ceph_monc_create_snapid(monc, dev->poolid, &new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001627 dout("created snapid=%lld\n", new_snapid);
1628 if (ret < 0)
1629 return ret;
1630
1631 data = kmalloc(name_len + 16, gfp_flags);
1632 if (!data)
1633 return -ENOMEM;
1634
Sage Weil916d4d62011-05-12 16:10:50 -07001635 p = data;
1636 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001637
Sage Weil916d4d62011-05-12 16:10:50 -07001638 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1639 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001640
1641 ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
Sage Weil916d4d62011-05-12 16:10:50 -07001642 data, p - data, &ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001643
Sage Weil916d4d62011-05-12 16:10:50 -07001644 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001645
1646 if (ret < 0)
1647 return ret;
1648
1649 dev->header.snapc->seq = new_snapid;
1650
1651 return 0;
1652bad:
1653 return -ERANGE;
1654}
1655
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001656static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1657{
1658 struct rbd_snap *snap;
1659
1660 while (!list_empty(&rbd_dev->snaps)) {
1661 snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node);
1662 __rbd_remove_snap_dev(rbd_dev, snap);
1663 }
1664}
1665
/*
 * only read the first part of the ondisk header, without the snaps info
 *
 * Re-reads the header from the osds, resizes the block device to
 * match, and swaps the freshly decoded snapshot data into
 * rbd_dev->header under snap_rwsem, then rebuilds the sysfs snapshot
 * devices.
 */
static int __rbd_update_snaps(struct rbd_device *rbd_dev)
{
	int ret;
	struct rbd_image_header h;
	u64 snap_seq;
	int follow_seq = 0;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	/* resized? */
	set_capacity(rbd_dev->disk, h.image_size / 512ULL);

	down_write(&rbd_dev->header.snap_rwsem);

	snap_seq = rbd_dev->header.snapc->seq;
	if (rbd_dev->header.total_snaps &&
	    rbd_dev->header.snapc->snaps[0] == snap_seq)
		/* pointing at the head, will need to follow that
		   if head moves */
		follow_seq = 1;

	/* free the old snapshot data before adopting the new header's */
	kfree(rbd_dev->header.snapc);
	kfree(rbd_dev->header.snap_names);
	kfree(rbd_dev->header.snap_sizes);

	rbd_dev->header.total_snaps = h.total_snaps;
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_names_len = h.snap_names_len;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	if (follow_seq)
		rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0];
	else
		rbd_dev->header.snapc->seq = snap_seq;

	/* reconcile the sysfs snapshot devices with the new list */
	ret = __rbd_init_snaps_header(rbd_dev);

	up_write(&rbd_dev->header.snap_rwsem);

	return ret;
}
1712
/*
 * Set up the gendisk and request queue for the mapped image and
 * announce it.  Reads the header from the osds first to learn the
 * image size.  Returns 0 on success, negative errno otherwise.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	int rc;
	u64 total_size = 0;

	/* contact OSD, request size info about the object being mapped */
	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (rc)
		return rc;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = __rbd_init_snaps_header(rbd_dev);
	if (rc)
		return rc;

	rc = rbd_header_set_snap(rbd_dev, &total_size);
	if (rc)
		return rc;

	/* create gendisk info */
	rc = -ENOMEM;
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		goto out;

	snprintf(disk->disk_name, sizeof(disk->disk_name), DRV_NAME "%d",
		 rbd_dev->id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	rc = -ENOMEM;
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* set io sizes to object size */
	blk_queue_max_hw_sectors(q, rbd_obj_bytes(&rbd_dev->header) / 512ULL);
	blk_queue_max_segment_size(q, rbd_obj_bytes(&rbd_dev->header));
	blk_queue_io_min(q, rbd_obj_bytes(&rbd_dev->header));
	blk_queue_io_opt(q, rbd_obj_bytes(&rbd_dev->header));

	/* keep bios from straddling objects (see rbd_merge_bvec) */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;
	rbd_dev->q = q;

	/* finally, announce the disk to the world */
	set_capacity(disk, total_size / 512ULL);
	add_disk(disk);

	pr_info("%s: added with size 0x%llx\n",
		disk->disk_name, (unsigned long long)total_size);
	return 0;

out_disk:
	put_disk(disk);
out:
	return rc;
}
1780
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001781/*
1782 sysfs
1783*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001784
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001785static ssize_t rbd_size_show(struct device *dev,
1786 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001787{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001788 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1789
1790 return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001791}
1792
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001793static ssize_t rbd_major_show(struct device *dev,
1794 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001795{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001796 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1797
1798 return sprintf(buf, "%d\n", rbd_dev->major);
1799}
1800
1801static ssize_t rbd_client_id_show(struct device *dev,
1802 struct device_attribute *attr, char *buf)
1803{
1804 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1805
Alex Elder1dbb4392012-01-24 10:08:37 -06001806 return sprintf(buf, "client%lld\n",
1807 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001808}
1809
1810static ssize_t rbd_pool_show(struct device *dev,
1811 struct device_attribute *attr, char *buf)
1812{
1813 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1814
1815 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1816}
1817
1818static ssize_t rbd_name_show(struct device *dev,
1819 struct device_attribute *attr, char *buf)
1820{
1821 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1822
1823 return sprintf(buf, "%s\n", rbd_dev->obj);
1824}
1825
1826static ssize_t rbd_snap_show(struct device *dev,
1827 struct device_attribute *attr,
1828 char *buf)
1829{
1830 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1831
1832 return sprintf(buf, "%s\n", rbd_dev->snap_name);
1833}
1834
1835static ssize_t rbd_image_refresh(struct device *dev,
1836 struct device_attribute *attr,
1837 const char *buf,
1838 size_t size)
1839{
1840 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1841 int rc;
1842 int ret = size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001843
1844 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1845
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001846 rc = __rbd_update_snaps(rbd_dev);
1847 if (rc < 0)
1848 ret = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001849
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001850 mutex_unlock(&ctl_mutex);
1851 return ret;
1852}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001853
/*
 * Per-device sysfs attributes; the ABI is described in
 * Documentation/ABI/testing/sysfs-bus-rbd.
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001862
/* attributes exported for every mapped rbd device */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_name.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	&dev_attr_create_snap.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* no-op release callback for the rbd device type */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
1893
1894
1895/*
1896 sysfs - snapshots
1897*/
1898
1899static ssize_t rbd_snap_size_show(struct device *dev,
1900 struct device_attribute *attr,
1901 char *buf)
1902{
1903 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1904
1905 return sprintf(buf, "%lld\n", (long long)snap->size);
1906}
1907
1908static ssize_t rbd_snap_id_show(struct device *dev,
1909 struct device_attribute *attr,
1910 char *buf)
1911{
1912 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1913
1914 return sprintf(buf, "%lld\n", (long long)snap->id);
1915}
1916
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

/* attributes exported for each snapshot device */
static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* final put on a snapshot's device: free the rbd_snap and its name */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
1946
/*
 * Unlink @snap from the device's snapshot list and unregister its
 * sysfs device; the final reference drop ends up in
 * rbd_snap_dev_release(), which frees the rbd_snap itself.
 */
static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
				  struct rbd_snap *snap)
{
	list_del(&snap->node);
	device_unregister(&snap->dev);
}
1953
1954static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
1955 struct rbd_snap *snap,
1956 struct device *parent)
1957{
1958 struct device *dev = &snap->dev;
1959 int ret;
1960
1961 dev->type = &rbd_snap_device_type;
1962 dev->parent = parent;
1963 dev->release = rbd_snap_dev_release;
1964 dev_set_name(dev, "snap_%s", snap->name);
1965 ret = device_register(dev);
1966
1967 return ret;
1968}
1969
1970static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
1971 int i, const char *name,
1972 struct rbd_snap **snapp)
1973{
1974 int ret;
1975 struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
1976 if (!snap)
1977 return -ENOMEM;
1978 snap->name = kstrdup(name, GFP_KERNEL);
1979 snap->size = rbd_dev->header.snap_sizes[i];
1980 snap->id = rbd_dev->header.snapc->snaps[i];
1981 if (device_is_registered(&rbd_dev->dev)) {
1982 ret = rbd_register_snap_dev(rbd_dev, snap,
1983 &rbd_dev->dev);
1984 if (ret < 0)
1985 goto err;
1986 }
1987 *snapp = snap;
1988 return 0;
1989err:
1990 kfree(snap->name);
1991 kfree(snap);
1992 return ret;
1993}
1994
/*
 * search for the previous snap in a null delimited string list
 *
 * Walks backward from @name (which points at the start of an entry)
 * to the start of the entry before it.  Returns NULL when no
 * predecessor can exist (fewer than two bytes before @name).
 */
const char *rbd_prev_snap_name(const char *name, const char *start)
{
	const char *p;

	/* need at least a terminator plus one character before @name */
	if (name < start + 2)
		return NULL;

	for (p = name - 2; *p; p--)
		if (p == start)
			return start;

	/* p sits on the previous entry's terminator */
	return p + 1;
}
2011
/*
 * compare the old list of snapshots that we have to what's in the header
 * and update it accordingly. Note that the header holds the snapshots
 * in a reverse order (from newest to oldest) and we need to go from
 * older to new so that we don't get a duplicate snap name when
 * doing the process (e.g., removed snapshot and recreated a new
 * one with the same name.
 */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
{
	const char *name, *first_name;
	int i = rbd_dev->header.total_snaps;
	struct rbd_snap *snap, *old_snap = NULL;
	int ret;
	struct list_head *p, *n;

	first_name = rbd_dev->header.snap_names;
	/* start one past the end; rbd_prev_snap_name() walks backward */
	name = first_name + rbd_dev->header.snap_names_len;

	list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
		u64 cur_id;

		old_snap = list_entry(p, struct rbd_snap, node);

		if (i)
			cur_id = rbd_dev->header.snapc->snaps[i - 1];

		/* short-circuit keeps cur_id from being read when i == 0 */
		if (!i || old_snap->id < cur_id) {
			/* old_snap->id was skipped, thus was removed */
			__rbd_remove_snap_dev(rbd_dev, old_snap);
			continue;
		}
		if (old_snap->id == cur_id) {
			/* we have this snapshot already */
			i--;
			name = rbd_prev_snap_name(name, first_name);
			continue;
		}
		for (; i > 0;
		     i--, name = rbd_prev_snap_name(name, first_name)) {
			if (!name) {
				WARN_ON(1);
				return -EINVAL;
			}
			/*
			 * NOTE(review): the outer loop reads snaps[i - 1]
			 * while this reads snaps[i] — confirm the
			 * off-by-one difference is intentional.
			 */
			cur_id = rbd_dev->header.snapc->snaps[i];
			/* snapshot removal? handle it above */
			if (cur_id >= old_snap->id)
				break;
			/* a new snapshot */
			ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
			if (ret < 0)
				return ret;

			/* note that we add it backward so using n and not p */
			list_add(&snap->node, n);
			p = &snap->node;
		}
	}
	/* we're done going over the old snap list, just add what's left */
	for (; i > 0; i--) {
		name = rbd_prev_snap_name(name, first_name);
		if (!name) {
			WARN_ON(1);
			return -EINVAL;
		}
		ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
		if (ret < 0)
			return ret;
		list_add(&snap->node, &rbd_dev->snaps);
	}

	return 0;
}
2085
2086
/*
 * Release callback for the static rbd root device.  There is nothing
 * to free -- rbd_root_dev is statically allocated -- but the device
 * core requires a release method, so provide an empty one.
 */
static void rbd_root_dev_release(struct device *dev)
{
}
2090
/* parent device that all mapped rbd devices hang off of in sysfs */
static struct device rbd_root_dev = {
	.init_name = "rbd",
	.release = rbd_root_dev_release,	/* no-op; device is static */
};
2095
/*
 * Register rbd_dev's device under the rbd root device, then register a
 * child device for each snapshot currently on its snap list.  Takes
 * ctl_mutex (nested) for the duration.  Returns 0, or the error from
 * device_register().
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	int ret = -ENOMEM;	/* NOTE(review): overwritten before first use */
	struct device *dev;
	struct rbd_snap *snap;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	dev = &rbd_dev->dev;

	/* wire the device into the rbd bus so it appears in sysfs */
	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;	/* frees rbd_dev on final put */
	dev_set_name(dev, "%d", rbd_dev->id);
	ret = device_register(dev);
	if (ret < 0)
		goto done_free;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		ret = rbd_register_snap_dev(rbd_dev, snap,
					     &rbd_dev->dev);
		if (ret < 0)
			/* NOTE(review): this failure is swallowed -- the
			 * function still returns 0 below.  Simply returning
			 * ret here would be unsafe as-is: the caller's error
			 * path frees rbd_dev without ever unregistering the
			 * device registered above. */
			break;
	}

	mutex_unlock(&ctl_mutex);
	return 0;
done_free:
	/* label name is historical; nothing is freed here, only unlocked */
	mutex_unlock(&ctl_mutex);
	return ret;
}
2127
/*
 * Unregister rbd_dev's device.  Dropping the last reference invokes
 * rbd_dev_release(), which tears down and frees rbd_dev itself.
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2132
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002133static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2134{
2135 int ret, rc;
2136
2137 do {
2138 ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name,
2139 rbd_dev->header.obj_version);
2140 if (ret == -ERANGE) {
2141 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2142 rc = __rbd_update_snaps(rbd_dev);
2143 mutex_unlock(&ctl_mutex);
2144 if (rc < 0)
2145 return rc;
2146 }
2147 } while (ret == -ERANGE);
2148
2149 return ret;
2150}
2151
Alex Elderb7f23c32012-01-29 13:57:43 -06002152/* caller must hold ctl_mutex */
2153static int rbd_id_get(void)
2154{
2155 struct list_head *tmp;
2156 int new_id = 0;
2157
2158 list_for_each(tmp, &rbd_dev_list) {
2159 struct rbd_device *rbd_dev;
2160
2161 rbd_dev = list_entry(tmp, struct rbd_device, node);
2162 if (rbd_dev->id >= new_id)
2163 new_id = rbd_dev->id + 1;
2164 }
2165
2166 return new_id;
2167}
2168
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002169static ssize_t rbd_add(struct bus_type *bus,
2170 const char *buf,
2171 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002172{
2173 struct ceph_osd_client *osdc;
2174 struct rbd_device *rbd_dev;
2175 ssize_t rc = -ENOMEM;
Alex Elderb7f23c32012-01-29 13:57:43 -06002176 int irc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002177 char *mon_dev_name;
2178 char *options;
2179
2180 if (!try_module_get(THIS_MODULE))
2181 return -ENODEV;
2182
2183 mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
2184 if (!mon_dev_name)
2185 goto err_out_mod;
2186
2187 options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
2188 if (!options)
2189 goto err_mon_dev;
2190
2191 /* new rbd_device object */
2192 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2193 if (!rbd_dev)
2194 goto err_out_opt;
2195
2196 /* static rbd_device initialization */
2197 spin_lock_init(&rbd_dev->lock);
2198 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002199 INIT_LIST_HEAD(&rbd_dev->snaps);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002200
Alex Elder0e805a12012-01-11 19:42:15 -08002201 init_rwsem(&rbd_dev->header.snap_rwsem);
2202
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002203 /* generate unique id: find highest unique id, add one */
2204 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2205
Alex Elderb7f23c32012-01-29 13:57:43 -06002206 rbd_dev->id = rbd_id_get();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002207
2208 /* add to global list */
2209 list_add_tail(&rbd_dev->node, &rbd_dev_list);
2210
2211 /* parse add command */
2212 if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s "
2213 "%" __stringify(RBD_MAX_OPT_LEN) "s "
2214 "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s "
2215 "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s"
2216 "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
2217 mon_dev_name, options, rbd_dev->pool_name,
2218 rbd_dev->obj, rbd_dev->snap_name) < 4) {
2219 rc = -EINVAL;
2220 goto err_out_slot;
2221 }
2222
2223 if (rbd_dev->snap_name[0] == 0)
Josh Durgincc9d7342011-11-21 18:19:13 -08002224 memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
2225 sizeof (RBD_SNAP_HEAD_NAME));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002226
2227 rbd_dev->obj_len = strlen(rbd_dev->obj);
2228 snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s",
2229 rbd_dev->obj, RBD_SUFFIX);
2230
2231 /* initialize rest of new object */
2232 snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id);
2233 rc = rbd_get_client(rbd_dev, mon_dev_name, options);
2234 if (rc < 0)
2235 goto err_out_slot;
2236
2237 mutex_unlock(&ctl_mutex);
2238
2239 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06002240 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002241 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2242 if (rc < 0)
2243 goto err_out_client;
2244 rbd_dev->poolid = rc;
2245
2246 /* register our block device */
2247 irc = register_blkdev(0, rbd_dev->name);
2248 if (irc < 0) {
2249 rc = irc;
2250 goto err_out_client;
2251 }
2252 rbd_dev->major = irc;
2253
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002254 rc = rbd_bus_add_dev(rbd_dev);
2255 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002256 goto err_out_blkdev;
2257
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002258 /* set up and announce blkdev mapping */
2259 rc = rbd_init_disk(rbd_dev);
2260 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002261 goto err_out_bus;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002262
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002263 rc = rbd_init_watch_dev(rbd_dev);
2264 if (rc)
2265 goto err_out_bus;
2266
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002267 return count;
2268
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002269err_out_bus:
2270 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2271 list_del_init(&rbd_dev->node);
2272 mutex_unlock(&ctl_mutex);
2273
2274 /* this will also clean up rest of rbd_dev stuff */
2275
2276 rbd_bus_del_dev(rbd_dev);
2277 kfree(options);
2278 kfree(mon_dev_name);
2279 return rc;
2280
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002281err_out_blkdev:
2282 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2283err_out_client:
2284 rbd_put_client(rbd_dev);
2285 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2286err_out_slot:
2287 list_del_init(&rbd_dev->node);
2288 mutex_unlock(&ctl_mutex);
2289
2290 kfree(rbd_dev);
2291err_out_opt:
2292 kfree(options);
2293err_mon_dev:
2294 kfree(mon_dev_name);
2295err_out_mod:
2296 dout("Error adding device %s\n", buf);
2297 module_put(THIS_MODULE);
2298 return rc;
2299}
2300
2301static struct rbd_device *__rbd_get_dev(unsigned long id)
2302{
2303 struct list_head *tmp;
2304 struct rbd_device *rbd_dev;
2305
2306 list_for_each(tmp, &rbd_dev_list) {
2307 rbd_dev = list_entry(tmp, struct rbd_device, node);
2308 if (rbd_dev->id == id)
2309 return rbd_dev;
2310 }
2311 return NULL;
2312}
2313
/*
 * Device-core release callback: runs when the last reference to
 * rbd_dev->dev is dropped (e.g. after device_unregister()).  Tears
 * down the header watch, drops the ceph client, frees the disk and
 * the rbd_device itself, and releases the module reference taken in
 * rbd_add().
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev =
			container_of(dev, struct rbd_device, dev);

	/* cancel the lingering watch request, if one was registered */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev, rbd_dev->obj_md_name);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
2338
/*
 * sysfs /sys/bus/rbd/remove handler.  Parses a decimal device id from
 * buf and tears down the matching rbd device.  Returns count on
 * success, -ENOENT if no device has that id, or a parse error.
 */
static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	int ret = count;	/* default: success, whole buffer consumed */

	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	list_del_init(&rbd_dev->node);

	/* unregistering the bus device ends up in rbd_dev_release(),
	 * which frees rbd_dev and its resources */
	__rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);

done:
	mutex_unlock(&ctl_mutex);
	return ret;
}
2374
/*
 * Per-device sysfs handler that creates a snapshot named by buf,
 * refreshes the cached snapshot list, and (best effort) notifies
 * watchers.  Returns count on success or a negative errno.
 */
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count)
{
	struct rbd_device *rbd_dev = dev_to_rbd(dev);
	int ret;
	char *name = kmalloc(count + 1, GFP_KERNEL);
	if (!name)
		return -ENOMEM;

	/* NOTE(review): with size `count`, snprintf copies at most
	 * count - 1 characters, silently dropping buf's last character
	 * (typically the trailing newline from echo) -- confirm this
	 * truncation is intended */
	snprintf(name, count, "%s", buf);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	ret = rbd_header_add_snap(rbd_dev,
				  name, GFP_KERNEL);
	if (ret < 0)
		goto err_unlock;

	ret = __rbd_update_snaps(rbd_dev);
	if (ret < 0)
		goto err_unlock;

	/* shouldn't hold ctl_mutex when notifying.. notify might
	   trigger a watch callback that would need to get that mutex */
	mutex_unlock(&ctl_mutex);

	/* make a best effort, don't error if failed */
	rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name);

	ret = count;
	kfree(name);
	return ret;

err_unlock:
	mutex_unlock(&ctl_mutex);
	kfree(name);
	return ret;
}
2415
/* bus-level control files: /sys/bus/rbd/add and /sys/bus/rbd/remove */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};
2421
2422/*
2423 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002424 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002425 */
2426static int rbd_sysfs_init(void)
2427{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002428 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002429
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002430 rbd_bus_type.bus_attrs = rbd_bus_attrs;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002431
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002432 ret = bus_register(&rbd_bus_type);
Alex Elder21079782012-01-24 10:08:36 -06002433 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002434 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002435
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002436 ret = device_register(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002437
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002438 return ret;
2439}
2440
/* tear down sysfs state in reverse order of rbd_sysfs_init() */
static void rbd_sysfs_cleanup(void)
{
	device_unregister(&rbd_root_dev);
	bus_unregister(&rbd_bus_type);
}
2446
2447int __init rbd_init(void)
2448{
2449 int rc;
2450
2451 rc = rbd_sysfs_init();
2452 if (rc)
2453 return rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002454 pr_info("loaded " DRV_NAME_LONG "\n");
2455 return 0;
2456}
2457
/* module exit point: remove the sysfs control files */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2462
2463module_init(rbd_init);
2464module_exit(rbd_exit);
2465
2466MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
2467MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
2468MODULE_DESCRIPTION("rados block device");
2469
2470/* following authorship retained from original osdblk.c */
2471MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
2472
2473MODULE_LICENSE("GPL");