blob: a828c6a276a845136979bd805fd57f47d2517ea2 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
#define DRV_NAME "rbd"
#define DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

/* room for the object suffix (RBD_SUFFIX comes from rbd_types.h) */
#define RBD_MAX_MD_NAME_LEN	(96 + sizeof(RBD_SUFFIX))
#define RBD_MAX_POOL_NAME_LEN	64
#define RBD_MAX_SNAP_NAME_LEN	32
#define RBD_MAX_OPT_LEN		1024

/* snapshot name meaning "the writable head of the image, no snapshot" */
#define RBD_SNAP_HEAD_NAME	"-"

#define DEV_NAME_LEN		32

/* default for the notify_timeout mount option (units not shown here —
 * presumably seconds; confirm at the point of use) */
#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
59
Yehuda Sadeh602adf42010-08-12 16:11:25 -070060/*
61 * block device image metadata (in-memory version)
62 */
63struct rbd_image_header {
64 u64 image_size;
65 char block_name[32];
66 __u8 obj_order;
67 __u8 crypt_type;
68 __u8 comp_type;
69 struct rw_semaphore snap_rwsem;
70 struct ceph_snap_context *snapc;
71 size_t snap_names_len;
72 u64 snap_seq;
73 u32 total_snaps;
74
75 char *snap_names;
76 u64 *snap_sizes;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070077
78 u64 obj_version;
79};
80
/* rbd-specific mount options, filled in by parse_rbd_opts_token() */
struct rbd_options {
	int	notify_timeout;	/* notify_timeout=%d; defaults to
				 * RBD_NOTIFY_TIMEOUT_DEFAULT */
};
84
/*
 * an instance of the client. multiple devices may share a client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct rbd_options	*rbd_opts;	/* owned; freed in
						 * rbd_client_release() */
	struct kref		kref;		/* release via
						 * rbd_client_release() */
	struct list_head	node;		/* entry in rbd_client_list,
						 * under node_lock */
};
94
Yehuda Sadeh1fec7092011-05-13 13:52:56 -070095struct rbd_req_coll;
96
Yehuda Sadeh602adf42010-08-12 16:11:25 -070097/*
98 * a single io request
99 */
100struct rbd_request {
101 struct request *rq; /* blk layer request */
102 struct bio *bio; /* cloned bio */
103 struct page **pages; /* list of used pages */
104 u64 len;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700105 int coll_index;
106 struct rbd_req_coll *coll;
107};
108
/* completion state for one slot of an rbd_req_coll */
struct rbd_req_status {
	int done;	/* nonzero once this sub-request has completed */
	int rc;		/* completion status code */
	u64 bytes;	/* bytes transferred */
};
114
/*
 * a collection of requests
 */
struct rbd_req_coll {
	int			total;		/* number of status[] slots */
	int			num_done;	/* completed-and-reported count;
						 * updated under the queue lock
						 * in rbd_coll_end_req_index() */
	struct kref		kref;		/* one put per reported slot;
						 * released by rbd_coll_release() */
	struct rbd_req_status	status[0];	/* trailing variable-size array */
};
124
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800125struct rbd_snap {
126 struct device dev;
127 const char *name;
128 size_t size;
129 struct list_head node;
130 u64 id;
131};
132
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700133/*
134 * a single device
135 */
136struct rbd_device {
137 int id; /* blkdev unique id */
138
139 int major; /* blkdev assigned major */
140 struct gendisk *disk; /* blkdev's gendisk and rq */
141 struct request_queue *q;
142
143 struct ceph_client *client;
144 struct rbd_client *rbd_client;
145
146 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
147
148 spinlock_t lock; /* queue lock */
149
150 struct rbd_image_header header;
151 char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
152 int obj_len;
153 char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
154 char pool_name[RBD_MAX_POOL_NAME_LEN];
155 int poolid;
156
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700157 struct ceph_osd_event *watch_event;
158 struct ceph_osd_request *watch_request;
159
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700160 char snap_name[RBD_MAX_SNAP_NAME_LEN];
161 u32 cur_snap; /* index+1 of current snapshot within snap context
162 0 - for the head */
163 int read_only;
164
165 struct list_head node;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800166
167 /* list of snapshots */
168 struct list_head snaps;
169
170 /* sysfs related */
171 struct device dev;
172};
173
/* all rbd devices hang off a single "rbd" pseudo-bus in sysfs */
static struct bus_type rbd_bus_type = {
	.name		= "rbd",
};
177
static spinlock_t node_lock;	  /* protects client get/put */
/*
 * NOTE(review): node_lock has no static initializer here; presumably
 * spin_lock_init() is called during module init (not visible in this
 * chunk) — confirm, or switch to DEFINE_SPINLOCK().
 */

static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */
static LIST_HEAD(rbd_dev_list);	  /* devices */
static LIST_HEAD(rbd_client_list);	/* clients */
183
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800184static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
185static void rbd_dev_release(struct device *dev);
186static ssize_t rbd_snap_rollback(struct device *dev,
187 struct device_attribute *attr,
188 const char *buf,
189 size_t size);
190static ssize_t rbd_snap_add(struct device *dev,
191 struct device_attribute *attr,
192 const char *buf,
193 size_t count);
194static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
Justin P. Mattock69932482011-07-26 23:06:29 -0700195 struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800196
197
/* map an embedded sysfs struct device back to its rbd_device */
static struct rbd_device *dev_to_rbd(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
202
/* take a reference on the device's embedded sysfs object */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}
207
/* drop the reference taken by rbd_get_dev() */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700212
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700213static int __rbd_update_snaps(struct rbd_device *rbd_dev);
214
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700215static int rbd_open(struct block_device *bdev, fmode_t mode)
216{
217 struct gendisk *disk = bdev->bd_disk;
218 struct rbd_device *rbd_dev = disk->private_data;
219
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800220 rbd_get_dev(rbd_dev);
221
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700222 set_device_ro(bdev, rbd_dev->read_only);
223
224 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
225 return -EROFS;
226
227 return 0;
228}
229
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800230static int rbd_release(struct gendisk *disk, fmode_t mode)
231{
232 struct rbd_device *rbd_dev = disk->private_data;
233
234 rbd_put_dev(rbd_dev);
235
236 return 0;
237}
238
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700239static const struct block_device_operations rbd_bd_ops = {
240 .owner = THIS_MODULE,
241 .open = rbd_open,
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800242 .release = rbd_release,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700243};
244
245/*
246 * Initialize an rbd client instance.
247 * We own *opt.
248 */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700249static struct rbd_client *rbd_client_create(struct ceph_options *opt,
250 struct rbd_options *rbd_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700251{
252 struct rbd_client *rbdc;
253 int ret = -ENOMEM;
254
255 dout("rbd_client_create\n");
256 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
257 if (!rbdc)
258 goto out_opt;
259
260 kref_init(&rbdc->kref);
261 INIT_LIST_HEAD(&rbdc->node);
262
Sage Weil6ab00d42011-08-09 09:41:59 -0700263 rbdc->client = ceph_create_client(opt, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700264 if (IS_ERR(rbdc->client))
265 goto out_rbdc;
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400266 opt = NULL; /* Now rbdc->client is responsible for opt */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700267
268 ret = ceph_open_session(rbdc->client);
269 if (ret < 0)
270 goto out_err;
271
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700272 rbdc->rbd_opts = rbd_opts;
273
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700274 spin_lock(&node_lock);
275 list_add_tail(&rbdc->node, &rbd_client_list);
276 spin_unlock(&node_lock);
277
278 dout("rbd_client_create created %p\n", rbdc);
279 return rbdc;
280
281out_err:
282 ceph_destroy_client(rbdc->client);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700283out_rbdc:
284 kfree(rbdc);
285out_opt:
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400286 if (opt)
287 ceph_destroy_options(opt);
288 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700289}
290
/*
 * Find a ceph client with specific addr and configuration.
 *
 * Caller must hold node_lock (see rbd_get_client()).  Returns NULL when
 * client sharing is disabled (CEPH_OPT_NOSHARE) or no match exists.
 */
static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
{
	struct rbd_client *client_node;

	if (opt->flags & CEPH_OPT_NOSHARE)
		return NULL;

	list_for_each_entry(client_node, &rbd_client_list, node)
		if (ceph_compare_options(opt, client_node->client) == 0)
			return client_node;
	return NULL;
}
306
/*
 * mount options
 *
 * Token values below Opt_last_int take an integer argument, values
 * between Opt_last_int and Opt_last_string take a string argument
 * (see parse_rbd_opts_token()).
 */
enum {
	Opt_notify_timeout,
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
};

static match_table_t rbdopt_tokens = {
	{Opt_notify_timeout, "notify_timeout=%d"},
	/* int args above */
	/* string args above */
	{-1, NULL}
};
324
325static int parse_rbd_opts_token(char *c, void *private)
326{
327 struct rbd_options *rbdopt = private;
328 substring_t argstr[MAX_OPT_ARGS];
329 int token, intval, ret;
330
331 token = match_token((char *)c, rbdopt_tokens, argstr);
332 if (token < 0)
333 return -EINVAL;
334
335 if (token < Opt_last_int) {
336 ret = match_int(&argstr[0], &intval);
337 if (ret < 0) {
338 pr_err("bad mount option arg (not int) "
339 "at '%s'\n", c);
340 return ret;
341 }
342 dout("got int token %d val %d\n", token, intval);
343 } else if (token > Opt_last_int && token < Opt_last_string) {
344 dout("got string token %d val %s\n", token,
345 argstr[0].from);
346 } else {
347 dout("got token %d\n", token);
348 }
349
350 switch (token) {
351 case Opt_notify_timeout:
352 rbdopt->notify_timeout = intval;
353 break;
354 default:
355 BUG_ON(token);
356 }
357 return 0;
358}
359
360/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700361 * Get a ceph client with specific addr and configuration, if one does
362 * not exist create it.
363 */
364static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
365 char *options)
366{
367 struct rbd_client *rbdc;
368 struct ceph_options *opt;
369 int ret;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700370 struct rbd_options *rbd_opts;
371
372 rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
373 if (!rbd_opts)
374 return -ENOMEM;
375
376 rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700377
378 ret = ceph_parse_options(&opt, options, mon_addr,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700379 mon_addr + strlen(mon_addr), parse_rbd_opts_token, rbd_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700380 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700381 goto done_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700382
383 spin_lock(&node_lock);
384 rbdc = __rbd_client_find(opt);
385 if (rbdc) {
386 ceph_destroy_options(opt);
387
388 /* using an existing client */
389 kref_get(&rbdc->kref);
390 rbd_dev->rbd_client = rbdc;
391 rbd_dev->client = rbdc->client;
392 spin_unlock(&node_lock);
393 return 0;
394 }
395 spin_unlock(&node_lock);
396
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700397 rbdc = rbd_client_create(opt, rbd_opts);
398 if (IS_ERR(rbdc)) {
399 ret = PTR_ERR(rbdc);
400 goto done_err;
401 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700402
403 rbd_dev->rbd_client = rbdc;
404 rbd_dev->client = rbdc->client;
405 return 0;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700406done_err:
407 kfree(rbd_opts);
408 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700409}
410
/*
 * Destroy ceph client (kref release callback, invoked from
 * rbd_put_client() when the last reference is dropped).
 *
 * Unhooks the client from rbd_client_list, destroys the ceph client
 * and frees the rbd_options the client owns.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&node_lock);
	list_del(&rbdc->node);
	spin_unlock(&node_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc->rbd_opts);
	kfree(rbdc);
}
427
/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_device *rbd_dev)
{
	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
	/* clear cached pointers — the client may have just been freed */
	rbd_dev->rbd_client = NULL;
	rbd_dev->client = NULL;
}
438
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700439/*
440 * Destroy requests collection
441 */
442static void rbd_coll_release(struct kref *kref)
443{
444 struct rbd_req_coll *coll =
445 container_of(kref, struct rbd_req_coll, kref);
446
447 dout("rbd_coll_release %p\n", coll);
448 kfree(coll);
449}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700450
451/*
452 * Create a new header structure, translate header format from the on-disk
453 * header.
454 */
455static int rbd_header_from_disk(struct rbd_image_header *header,
456 struct rbd_image_header_ondisk *ondisk,
457 int allocated_snaps,
458 gfp_t gfp_flags)
459{
460 int i;
461 u32 snap_count = le32_to_cpu(ondisk->snap_count);
462 int ret = -ENOMEM;
463
Josh Durgin81e759f2011-11-15 14:49:53 -0800464 if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT))) {
465 return -ENXIO;
466 }
467
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700468 init_rwsem(&header->snap_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700469 header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
470 header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
471 snap_count *
472 sizeof(struct rbd_image_snap_ondisk),
473 gfp_flags);
474 if (!header->snapc)
475 return -ENOMEM;
476 if (snap_count) {
477 header->snap_names = kmalloc(header->snap_names_len,
478 GFP_KERNEL);
479 if (!header->snap_names)
480 goto err_snapc;
481 header->snap_sizes = kmalloc(snap_count * sizeof(u64),
482 GFP_KERNEL);
483 if (!header->snap_sizes)
484 goto err_names;
485 } else {
486 header->snap_names = NULL;
487 header->snap_sizes = NULL;
488 }
489 memcpy(header->block_name, ondisk->block_name,
490 sizeof(ondisk->block_name));
491
492 header->image_size = le64_to_cpu(ondisk->image_size);
493 header->obj_order = ondisk->options.order;
494 header->crypt_type = ondisk->options.crypt_type;
495 header->comp_type = ondisk->options.comp_type;
496
497 atomic_set(&header->snapc->nref, 1);
498 header->snap_seq = le64_to_cpu(ondisk->snap_seq);
499 header->snapc->num_snaps = snap_count;
500 header->total_snaps = snap_count;
501
502 if (snap_count &&
503 allocated_snaps == snap_count) {
504 for (i = 0; i < snap_count; i++) {
505 header->snapc->snaps[i] =
506 le64_to_cpu(ondisk->snaps[i].id);
507 header->snap_sizes[i] =
508 le64_to_cpu(ondisk->snaps[i].image_size);
509 }
510
511 /* copy snapshot names */
512 memcpy(header->snap_names, &ondisk->snaps[i],
513 header->snap_names_len);
514 }
515
516 return 0;
517
518err_names:
519 kfree(header->snap_names);
520err_snapc:
521 kfree(header->snapc);
522 return ret;
523}
524
/*
 * Invert the cur_snap encoding: rbd_header_set_snap() stores
 * cur_snap = total_snaps - index, so this recovers the index into
 * snapc->snaps[] from a cur_snap value.
 */
static int snap_index(struct rbd_image_header *header, int snap_num)
{
	return header->total_snaps - snap_num;
}
529
/*
 * Return the id of the currently mapped snapshot, or 0 when the head
 * (no snapshot, cur_snap == 0) is mapped.
 */
static u64 cur_snap_id(struct rbd_device *rbd_dev)
{
	struct rbd_image_header *header = &rbd_dev->header;

	if (!rbd_dev->cur_snap)
		return 0;

	return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)];
}
539
540static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
541 u64 *seq, u64 *size)
542{
543 int i;
544 char *p = header->snap_names;
545
546 for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
547 if (strcmp(snap_name, p) == 0)
548 break;
549 }
550 if (i == header->total_snaps)
551 return -ENOENT;
552 if (seq)
553 *seq = header->snapc->snaps[i];
554
555 if (size)
556 *size = header->snap_sizes[i];
557
558 return i;
559}
560
/*
 * Point the device at a snapshot by name — or at the writable head when
 * @snap_name is NULL, empty, "-" or RBD_SNAP_HEAD_NAME — updating
 * snapc->seq, cur_snap and read_only under the header's write lock.
 * Optionally reports the image size at that point via @size.
 * Returns 0 on success, -ENOENT if the snapshot name is unknown.
 */
static int rbd_header_set_snap(struct rbd_device *dev,
			       const char *snap_name,
			       u64 *size)
{
	struct rbd_image_header *header = &dev->header;
	struct ceph_snap_context *snapc = header->snapc;
	int ret = -ENOENT;

	down_write(&header->snap_rwsem);

	if (!snap_name ||
	    !*snap_name ||
	    strcmp(snap_name, "-") == 0 ||
	    strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0) {
		/* mapping the head: writable, seq follows the header */
		if (header->total_snaps)
			snapc->seq = header->snap_seq;
		else
			snapc->seq = 0;
		dev->cur_snap = 0;
		dev->read_only = 0;
		if (size)
			*size = header->image_size;
	} else {
		ret = snap_by_name(header, snap_name, &snapc->seq, size);
		if (ret < 0)
			goto done;

		/* cur_snap encodes the index as total_snaps - index */
		dev->cur_snap = header->total_snaps - ret;
		dev->read_only = 1;	/* snapshots are mapped read-only */
	}

	ret = 0;
done:
	up_write(&header->snap_rwsem);
	return ret;
}
597
598static void rbd_header_free(struct rbd_image_header *header)
599{
600 kfree(header->snapc);
601 kfree(header->snap_names);
602 kfree(header->snap_sizes);
603}
604
605/*
606 * get the actual striped segment name, offset and length
607 */
608static u64 rbd_get_segment(struct rbd_image_header *header,
609 const char *block_name,
610 u64 ofs, u64 len,
611 char *seg_name, u64 *segofs)
612{
613 u64 seg = ofs >> header->obj_order;
614
615 if (seg_name)
616 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
617 "%s.%012llx", block_name, seg);
618
619 ofs = ofs & ((1 << header->obj_order) - 1);
620 len = min_t(u64, len, (1 << header->obj_order) - ofs);
621
622 if (segofs)
623 *segofs = ofs;
624
625 return len;
626}
627
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700628static int rbd_get_num_segments(struct rbd_image_header *header,
629 u64 ofs, u64 len)
630{
631 u64 start_seg = ofs >> header->obj_order;
632 u64 end_seg = (ofs + len - 1) >> header->obj_order;
633 return end_seg - start_seg + 1;
634}
635
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700636/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700637 * returns the size of an object in the image
638 */
639static u64 rbd_obj_bytes(struct rbd_image_header *header)
640{
641 return 1 << header->obj_order;
642}
643
644/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700645 * bio helpers
646 */
647
648static void bio_chain_put(struct bio *chain)
649{
650 struct bio *tmp;
651
652 while (chain) {
653 tmp = chain;
654 chain = chain->bi_next;
655 bio_put(tmp);
656 }
657}
658
/*
 * zeros a bio chain, starting at specific offset
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* running byte offset across the whole chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* zero only the tail of this vec that lies
				 * at or past start_ofs */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
685
/*
 * bio_chain_clone - clone a chain of bios up to a certain length.
 * might return a bio_pair that will need to be released.
 *
 * Clones bios from *old until @len bytes are covered.  On return *old
 * points at the first unconsumed bio and *next at the continuation
 * point (the second half of a split bio, or the next bio in the old
 * chain).  Returns the new chain, or NULL on failure (any partially
 * built chain is dropped).
 */
static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
				   struct bio_pair **bp,
				   int len, gfp_t gfpmask)
{
	struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
	int total = 0;

	/* release the split left over from a previous call, if any */
	if (*bp) {
		bio_pair_release(*bp);
		*bp = NULL;
	}

	while (old_chain && (total < len)) {
		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
		if (!tmp)
			goto err_out;

		if (total + old_chain->bi_size > len) {
			struct bio_pair *bp;

			/*
			 * this split can only happen with a single paged bio,
			 * split_bio will BUG_ON if this is not the case
			 */
			dout("bio_chain_clone split! total=%d remaining=%d"
			     "bi_size=%d\n",
			     (int)total, (int)len-total,
			     (int)old_chain->bi_size);

			/* split the bio. We'll release it either in the next
			   call, or it will have to be released outside */
			bp = bio_split(old_chain, (len - total) / 512ULL);
			if (!bp)
				goto err_out;

			__bio_clone(tmp, &bp->bio1);

			*next = &bp->bio2;
		} else {
			__bio_clone(tmp, old_chain);
			*next = old_chain->bi_next;
		}

		tmp->bi_bdev = NULL;
		/* NOTE(review): presumably to avoid blocking on allocations
		 * after the first clone succeeds — confirm intent */
		gfpmask &= ~__GFP_WAIT;
		tmp->bi_next = NULL;

		if (!new_chain) {
			new_chain = tail = tmp;
		} else {
			tail->bi_next = tmp;
			tail = tmp;
		}
		old_chain = old_chain->bi_next;

		total += tmp->bi_size;
	}

	BUG_ON(total < len);

	if (tail)
		tail->bi_next = NULL;

	*old = old_chain;

	return new_chain;

err_out:
	dout("bio_chain_clone with err\n");
	bio_chain_put(new_chain);
	return NULL;
}
762
/*
 * helpers for osd request op vectors.
 *
 * Allocate a zeroed vector of num_ops ops plus a terminating all-zero
 * entry, setting the first op's opcode and payload length.  Returns 0
 * or -ENOMEM; the caller frees with rbd_destroy_ops().
 */
static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
			     int num_ops,
			     int opcode,
			     u32 payload_len)
{
	*ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
		       GFP_NOIO);
	if (!*ops)
		return -ENOMEM;
	(*ops)[0].op = opcode;
	/*
	 * op extent offset and length will be set later on
	 * in calc_raw_layout()
	 */
	(*ops)[0].payload_len = payload_len;
	return 0;
}
783
/* free an op vector allocated by rbd_create_rw_ops() */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
788
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700789static void rbd_coll_end_req_index(struct request *rq,
790 struct rbd_req_coll *coll,
791 int index,
792 int ret, u64 len)
793{
794 struct request_queue *q;
795 int min, max, i;
796
797 dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n",
798 coll, index, ret, len);
799
800 if (!rq)
801 return;
802
803 if (!coll) {
804 blk_end_request(rq, ret, len);
805 return;
806 }
807
808 q = rq->q;
809
810 spin_lock_irq(q->queue_lock);
811 coll->status[index].done = 1;
812 coll->status[index].rc = ret;
813 coll->status[index].bytes = len;
814 max = min = coll->num_done;
815 while (max < coll->total && coll->status[max].done)
816 max++;
817
818 for (i = min; i<max; i++) {
819 __blk_end_request(rq, coll->status[i].rc,
820 coll->status[i].bytes);
821 coll->num_done++;
822 kref_put(&coll->kref, rbd_coll_release);
823 }
824 spin_unlock_irq(q->queue_lock);
825}
826
/* complete a request using the collection bookkeeping stored inside it */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
832
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700833/*
834 * Send ceph osd request
835 */
836static int rbd_do_request(struct request *rq,
837 struct rbd_device *dev,
838 struct ceph_snap_context *snapc,
839 u64 snapid,
840 const char *obj, u64 ofs, u64 len,
841 struct bio *bio,
842 struct page **pages,
843 int num_pages,
844 int flags,
845 struct ceph_osd_req_op *ops,
846 int num_reply,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700847 struct rbd_req_coll *coll,
848 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700849 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700850 struct ceph_msg *msg),
851 struct ceph_osd_request **linger_req,
852 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700853{
854 struct ceph_osd_request *req;
855 struct ceph_file_layout *layout;
856 int ret;
857 u64 bno;
858 struct timespec mtime = CURRENT_TIME;
859 struct rbd_request *req_data;
860 struct ceph_osd_request_head *reqhead;
861 struct rbd_image_header *header = &dev->header;
862
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700863 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700864 if (!req_data) {
865 if (coll)
866 rbd_coll_end_req_index(rq, coll, coll_index,
867 -ENOMEM, len);
868 return -ENOMEM;
869 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700870
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700871 if (coll) {
872 req_data->coll = coll;
873 req_data->coll_index = coll_index;
874 }
875
876 dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700877
878 down_read(&header->snap_rwsem);
879
880 req = ceph_osdc_alloc_request(&dev->client->osdc, flags,
881 snapc,
882 ops,
883 false,
884 GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -0700885 if (!req) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700886 up_read(&header->snap_rwsem);
Sage Weil4ad12622011-05-03 09:23:36 -0700887 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700888 goto done_pages;
889 }
890
891 req->r_callback = rbd_cb;
892
893 req_data->rq = rq;
894 req_data->bio = bio;
895 req_data->pages = pages;
896 req_data->len = len;
897
898 req->r_priv = req_data;
899
900 reqhead = req->r_request->front.iov_base;
901 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
902
903 strncpy(req->r_oid, obj, sizeof(req->r_oid));
904 req->r_oid_len = strlen(req->r_oid);
905
906 layout = &req->r_file_layout;
907 memset(layout, 0, sizeof(*layout));
908 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
909 layout->fl_stripe_count = cpu_to_le32(1);
910 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
911 layout->fl_pg_preferred = cpu_to_le32(-1);
912 layout->fl_pg_pool = cpu_to_le32(dev->poolid);
913 ceph_calc_raw_layout(&dev->client->osdc, layout, snapid,
914 ofs, &len, &bno, req, ops);
915
916 ceph_osdc_build_request(req, ofs, &len,
917 ops,
918 snapc,
919 &mtime,
920 req->r_oid, req->r_oid_len);
921 up_read(&header->snap_rwsem);
922
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700923 if (linger_req) {
924 ceph_osdc_set_request_linger(&dev->client->osdc, req);
925 *linger_req = req;
926 }
927
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700928 ret = ceph_osdc_start_request(&dev->client->osdc, req, false);
929 if (ret < 0)
930 goto done_err;
931
932 if (!rbd_cb) {
933 ret = ceph_osdc_wait_request(&dev->client->osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700934 if (ver)
935 *ver = le64_to_cpu(req->r_reassert_version.version);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700936 dout("reassert_ver=%lld\n",
937 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700938 ceph_osdc_put_request(req);
939 }
940 return ret;
941
942done_err:
943 bio_chain_put(req_data->bio);
944 ceph_osdc_put_request(req);
945done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700946 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700947 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700948 return ret;
949}
950
/*
 * Ceph osd op callback
 *
 * Completes an async request: reads of nonexistent objects are turned
 * into all-zero data, short reads are zero-padded up to the requested
 * length, then the result is reported to the collection and the bio
 * chain, request and bookkeeping struct are released.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);

	if (rc == -ENOENT && read_op) {
		/* object doesn't exist: reads back as zeroes */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero the unread tail */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
989
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700990static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
991{
992 ceph_osdc_put_request(req);
993}
994
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700995/*
996 * Do a synchronous ceph osd operation
997 */
998static int rbd_req_sync_op(struct rbd_device *dev,
999 struct ceph_snap_context *snapc,
1000 u64 snapid,
1001 int opcode,
1002 int flags,
1003 struct ceph_osd_req_op *orig_ops,
1004 int num_reply,
1005 const char *obj,
1006 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001007 char *buf,
1008 struct ceph_osd_request **linger_req,
1009 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001010{
1011 int ret;
1012 struct page **pages;
1013 int num_pages;
1014 struct ceph_osd_req_op *ops = orig_ops;
1015 u32 payload_len;
1016
1017 num_pages = calc_pages_for(ofs , len);
1018 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
Dan Carpenterb8d06382010-10-11 21:14:23 +02001019 if (IS_ERR(pages))
1020 return PTR_ERR(pages);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001021
1022 if (!orig_ops) {
1023 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
1024 ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
1025 if (ret < 0)
1026 goto done;
1027
1028 if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
1029 ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
1030 if (ret < 0)
1031 goto done_ops;
1032 }
1033 }
1034
1035 ret = rbd_do_request(NULL, dev, snapc, snapid,
1036 obj, ofs, len, NULL,
1037 pages, num_pages,
1038 flags,
1039 ops,
1040 2,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001041 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001042 NULL,
1043 linger_req, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001044 if (ret < 0)
1045 goto done_ops;
1046
1047 if ((flags & CEPH_OSD_FLAG_READ) && buf)
1048 ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
1049
1050done_ops:
1051 if (!orig_ops)
1052 rbd_destroy_ops(ops);
1053done:
1054 ceph_release_page_vector(pages, num_pages);
1055 return ret;
1056}
1057
/*
 * Do an asynchronous ceph osd operation
 *
 * Maps the image byte range onto the single object (segment) containing
 * it and submits one async request; completion runs through rbd_req_cb
 * and the request collection.  The range must already fit in one
 * segment — the bios were split along segment boundaries earlier.
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 snapid,
		     int opcode, int flags, int num_reply,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;

	seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
	if (!seg_name)
		return -ENOMEM;

	seg_len = rbd_get_segment(&rbd_dev->header,
				  rbd_dev->header.block_name,
				  ofs, len,
				  seg_name, &seg_ofs);

	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);

	ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
	if (ret < 0)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	BUG_ON(seg_len < len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     num_reply,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1113
/*
 * Request async osd write.
 *
 * Thin wrapper around rbd_do_op(): always writes at the head
 * (CEPH_NOSNAP) using the device's current snap context.
 */
static int rbd_req_write(struct request *rq,
			 struct rbd_device *rbd_dev,
			 struct ceph_snap_context *snapc,
			 u64 ofs, u64 len,
			 struct bio *bio,
			 struct rbd_req_coll *coll,
			 int coll_index)
{
	return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
			 CEPH_OSD_OP_WRITE,
			 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			 2,
			 ofs, len, bio, coll, coll_index);
}
1131
/*
 * Request async osd read.
 *
 * Thin wrapper around rbd_do_op(); snapid 0 is mapped to
 * CEPH_NOSNAP (read the head). No snap context is needed for reads.
 */
static int rbd_req_read(struct request *rq,
			 struct rbd_device *rbd_dev,
			 u64 snapid,
			 u64 ofs, u64 len,
			 struct bio *bio,
			 struct rbd_req_coll *coll,
			 int coll_index)
{
	return rbd_do_op(rq, rbd_dev, NULL,
			 (snapid ? snapid : CEPH_NOSNAP),
			 CEPH_OSD_OP_READ,
			 CEPH_OSD_FLAG_READ,
			 2,
			 ofs, len, bio, coll, coll_index);
}
1150
/*
 * Request sync osd read.
 *
 * Synchronously reads @len bytes at @ofs from object @obj into @buf.
 * On success the object version is returned through @ver (if non-NULL).
 */
static int rbd_req_sync_read(struct rbd_device *dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *obj,
			  u64 ofs, u64 len,
			  char *buf,
			  u64 *ver)
{
	return rbd_req_sync_op(dev, NULL,
			       (snapid ? snapid : CEPH_NOSNAP),
			       CEPH_OSD_OP_READ,
			       CEPH_OSD_FLAG_READ,
			       NULL,
			       1, obj, ofs, len, buf, NULL, ver);
}
1169
/*
 * Acknowledge a received notification on the header object.
 *
 * NOTE(review): despite the _sync_ name this sends the NOTIFY_ACK
 * asynchronously — it goes through rbd_do_request() with
 * rbd_simple_req_cb and does not wait for completion.
 */
static int rbd_req_sync_notify_ack(struct rbd_device *dev,
				   u64 ver,
				   u64 notify_id,
				   const char *obj)
{
	struct ceph_osd_req_op *ops;
	struct page **pages = NULL;
	int ret;

	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (ret < 0)
		return ret;

	/* ack with the version we currently hold; cookie carries notify_id */
	ops[0].watch.ver = cpu_to_le64(dev->header.obj_version);
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, dev, NULL, CEPH_NOSNAP,
			  obj, 0, 0, NULL,
			  pages, 0,
			  CEPH_OSD_FLAG_READ,
			  ops,
			  1,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1202
/*
 * Watch callback for the header object: the header changed on the OSD
 * (e.g. a snapshot was taken), so refresh our snapshot state and ack
 * the notification.  @data is the struct rbd_device registered in
 * rbd_req_sync_watch().
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *dev = (struct rbd_device *)data;
	int rc;

	if (!dev)
		return;

	dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
		notify_id, (int)opcode);
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	rc = __rbd_update_snaps(dev);
	mutex_unlock(&ctl_mutex);
	if (rc)
		pr_warning(DRV_NAME "%d got notification but failed to update"
			   " snaps: %d\n", dev->major, rc);

	/* ack even if the update failed, so the OSD does not retry forever */
	rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name);
}
1222
/*
 * Request sync osd watch.
 *
 * Registers a watch on object @obj at version @ver so that
 * rbd_watch_cb() is invoked when the header changes.  On success
 * dev->watch_event and dev->watch_request are set; on failure both
 * are left untouched/cleared.
 */
static int rbd_req_sync_watch(struct rbd_device *dev,
			      const char *obj,
			      u64 ver)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &dev->client->osdc;

	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
	if (ret < 0)
		return ret;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)dev, &dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(ver);
	ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 == register the watch */

	ret = rbd_req_sync_op(dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      1, obj, 0, 0, NULL,
			      &dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(dev->watch_event);
	dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1267
/*
 * Request sync osd unwatch.
 *
 * Deregisters the watch on @obj (watch.flag = 0) and cancels the
 * associated event.  Assumes dev->watch_event was set by a prior
 * successful rbd_req_sync_watch().
 */
static int rbd_req_sync_unwatch(struct rbd_device *dev,
				const char *obj)
{
	struct ceph_osd_req_op *ops;

	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
	if (ret < 0)
		return ret;

	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
	ops[0].watch.flag = 0;	/* 0 == remove the watch */

	ret = rbd_req_sync_op(dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      1, obj, 0, 0, NULL, NULL, NULL);

	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(dev->watch_event);
	dev->watch_event = NULL;
	return ret;
}
1296
/* Context handed to rbd_notify_cb() while waiting for a notify round trip. */
struct rbd_notify_info {
	struct rbd_device *dev;
};
1300
/*
 * Event callback for our own notify (rbd_req_sync_notify); only logs —
 * the waiting is done via ceph_osdc_wait_event() in the caller.
 */
static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *dev = (struct rbd_device *)data;
	if (!dev)
		return;

	dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
		notify_id, (int)opcode);
}
1310
/*
 * Request sync osd notify.
 *
 * Sends a NOTIFY on @obj (so other watchers re-read the header) and
 * waits for the notify round trip to complete.
 */
static int rbd_req_sync_notify(struct rbd_device *dev,
		          const char *obj)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &dev->client->osdc;
	struct ceph_osd_event *event;
	struct rbd_notify_info info;
	/* payload: version (u32) + timeout (u32) */
	int payload_len = sizeof(u32) + sizeof(u32);
	int ret;

	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
	if (ret < 0)
		return ret;

	info.dev = dev;

	/* one_shot event: fires once when the notify completes */
	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
				     (void *)&info, &event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = 1;
	ops[0].watch.flag = 1;
	ops[0].watch.cookie = event->cookie;
	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
	ops[0].watch.timeout = 12;	/* NOTE(review): magic timeout (secs?) — confirm units */

	ret = rbd_req_sync_op(dev, NULL,
			       CEPH_NOSNAP,
			       0,
			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			       ops,
			       1, obj, 0, 0, NULL, NULL, NULL);
	if (ret < 0)
		goto fail_event;

	ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
	dout("ceph_osdc_wait_event returned %d\n", ret);
	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(event);
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1361
/*
 * Request sync osd rollback.
 *
 * Synchronously rolls object @obj back to snapshot @snapid.
 */
static int rbd_req_sync_rollback_obj(struct rbd_device *dev,
				     u64 snapid,
				     const char *obj)
{
	struct ceph_osd_req_op *ops;
	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_ROLLBACK, 0);
	if (ret < 0)
		return ret;

	ops[0].snap.snapid = snapid;

	ret = rbd_req_sync_op(dev, NULL,
			       CEPH_NOSNAP,
			       0,
			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			       ops,
			       1, obj, 0, 0, NULL, NULL, NULL);

	rbd_destroy_ops(ops);

	return ret;
}
1387
/*
 * Synchronously execute an OSD class method (CEPH_OSD_OP_CALL) on @obj.
 * @cls/@method name the server-side class and method; @data/@len is the
 * opaque input blob.  The resulting object version is returned in @ver.
 * (The previous comment here said "Request sync osd read", which was wrong.)
 */
static int rbd_req_sync_exec(struct rbd_device *dev,
			     const char *obj,
			     const char *cls,
			     const char *method,
			     const char *data,
			     int len,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int cls_len = strlen(cls);
	int method_len = strlen(method);
	/* payload carries class name + method name + input data */
	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
				    cls_len + method_len + len);
	if (ret < 0)
		return ret;

	ops[0].cls.class_name = cls;
	ops[0].cls.class_len = (__u8)cls_len;
	ops[0].cls.method_name = method;
	ops[0].cls.method_len = (__u8)method_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = data;
	ops[0].cls.indata_len = len;

	ret = rbd_req_sync_op(dev, NULL,
			       CEPH_NOSNAP,
			       0,
			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			       ops,
			       1, obj, 0, 0, NULL, NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1427
/*
 * Allocate a request collection tracking @num_reqs per-segment statuses.
 * GFP_ATOMIC: called from the block request function.  Returns NULL on
 * allocation failure.  Freed via kref (rbd_coll_release).
 */
static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
{
	struct rbd_req_coll *coll =
			kzalloc(sizeof(struct rbd_req_coll) +
			        sizeof(struct rbd_req_status) * num_reqs,
			        GFP_ATOMIC);

	if (!coll)
		return NULL;
	coll->total = num_reqs;
	kref_init(&coll->kref);
	return coll;
}
1441
/*
 * block device queue callback
 *
 * Drains the request queue: each request is split along object-segment
 * boundaries, each segment's bio chain is cloned and submitted as an
 * async OSD read or write.  Per-request completion is tracked by a
 * refcounted rbd_req_coll.  Called with q->queue_lock held; the lock is
 * dropped while segments are prepared/submitted and retaken before
 * fetching the next request.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;
	struct bio_pair *bp = NULL;

	rq = blk_fetch_request(q);

	while (1) {
		struct bio *bio;
		struct bio *rq_bio, *next_bio = NULL;
		bool do_write;
		int size, op_size = 0;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;

		/* peek at request from block layer */
		if (!rq)
			break;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			goto next;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * 512ULL;
		rq_bio = rq->bio;
		if (do_write && rbd_dev->read_only) {
			__blk_end_request_all(rq, -EROFS);
			goto next;
		}

		/* drop the queue lock while we talk to the OSDs */
		spin_unlock_irq(q->queue_lock);

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, blk_rq_pos(rq) * 512ULL);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			goto next;
		}

		do {
			/* a bio clone to be passed down to OSD req */
			dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
			op_size = rbd_get_segment(&rbd_dev->header,
						  rbd_dev->header.block_name,
						  ofs, size,
						  NULL, NULL);
			/* one coll reference per in-flight segment */
			kref_get(&coll->kref);
			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
					      op_size, GFP_ATOMIC);
			if (!bio) {
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, op_size);
				goto next_seg;
			}


			/* init OSD command: write or read */
			if (do_write)
				rbd_req_write(rq, rbd_dev,
					      rbd_dev->header.snapc,
					      ofs,
					      op_size, bio,
					      coll, cur_seg);
			else
				rbd_req_read(rq, rbd_dev,
					     cur_snap_id(rbd_dev),
					     ofs,
					     op_size, bio,
					     coll, cur_seg);

next_seg:
			size -= op_size;
			ofs += op_size;

			cur_seg++;
			rq_bio = next_bio;
		} while (size > 0);
		/* drop the submission reference taken by rbd_alloc_coll */
		kref_put(&coll->kref, rbd_coll_release);

		if (bp)
			bio_pair_release(bp);
		spin_lock_irq(q->queue_lock);
next:
		rq = blk_fetch_request(q);
	}
}
1546
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone
 *
 * Returns the number of bytes of @bvec that may be added to the bio
 * described by @bmd without crossing an object (chunk) boundary.
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	/* object size in 512-byte sectors */
	unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9);
	sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
	unsigned int bio_sectors = bmd->bi_size >> 9;
	int max;

	/* bytes remaining in the current chunk after this bio */
	max =  (chunk_sectors - ((sector & (chunk_sectors - 1))
				 + bio_sectors)) << 9;
	if (max < 0)
		max = 0; /* bio_add cannot handle a negative return */
	/* always allow at least one bvec into an empty bio */
	if (max <= bvec->bv_len && bio_sectors == 0)
		return bvec->bv_len;
	return max;
}
1569
/*
 * Tear down the gendisk and the in-memory header for @rbd_dev.
 * Safe to call when no disk was ever allocated.
 */
static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk = rbd_dev->disk;

	if (!disk)
		return;

	rbd_header_free(&rbd_dev->header);

	if (disk->flags & GENHD_FL_UP)
		del_gendisk(disk);
	if (disk->queue)
		blk_cleanup_queue(disk->queue);
	put_disk(disk);
}
1585
/*
 * reload the ondisk header
 *
 * The on-disk header is variable length (it embeds the snapshot list),
 * so we read with a guessed size and retry with the advertised
 * total_snaps/snap_names_len until the snapshot count is stable.
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
			   struct rbd_image_header *header)
{
	ssize_t rc;
	struct rbd_image_header_ondisk *dh;
	int snap_count = 0;
	u64 snap_names_len = 0;
	u64 ver;

	while (1) {
		int len = sizeof(*dh) +
			  snap_count * sizeof(struct rbd_image_snap_ondisk) +
			  snap_names_len;

		rc = -ENOMEM;
		dh = kmalloc(len, GFP_KERNEL);
		if (!dh)
			return -ENOMEM;

		rc = rbd_req_sync_read(rbd_dev,
				       NULL, CEPH_NOSNAP,
				       rbd_dev->obj_md_name,
				       0, len,
				       (char *)dh, &ver);
		if (rc < 0)
			goto out_dh;

		rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
		if (rc < 0) {
			if (rc == -ENXIO) {
				pr_warning("unrecognized header format"
					   " for image %s", rbd_dev->obj);
			}
			goto out_dh;
		}

		/* snapshot list changed under us? re-read with the new size */
		if (snap_count != header->total_snaps) {
			snap_count = header->total_snaps;
			snap_names_len = header->snap_names_len;
			rbd_header_free(header);
			kfree(dh);
			continue;
		}
		break;
	}
	header->obj_version = ver;

out_dh:
	kfree(dh);
	return rc;
}
1640
/*
 * create a snapshot
 *
 * Allocates a new snap id from the monitor, then invokes the "snap_add"
 * method of the rbd class on the header object with the encoded
 * (name, id) pair.  Only valid when mapped at the head.
 */
static int rbd_header_add_snap(struct rbd_device *dev,
			       const char *snap_name,
			       gfp_t gfp_flags)
{
	int name_len = strlen(snap_name);
	u64 new_snapid;
	int ret;
	void *data, *p, *e;
	u64 ver;

	/* we should create a snapshot only if we're pointing at the head */
	if (dev->cur_snap)
		return -EINVAL;

	ret = ceph_monc_create_snapid(&dev->client->monc, dev->poolid,
				      &new_snapid);
	dout("created snapid=%lld\n", new_snapid);
	if (ret < 0)
		return ret;

	/* string (4-byte len + bytes) + u64 snapid, with slack */
	data = kmalloc(name_len + 16, gfp_flags);
	if (!data)
		return -ENOMEM;

	p = data;
	e = data + name_len + 16;

	ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
	ceph_encode_64_safe(&p, e, new_snapid, bad);

	ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
				data, p - data, &ver);

	kfree(data);

	if (ret < 0)
		return ret;

	dev->header.snapc->seq =  new_snapid;

	return 0;
bad:
	return -ERANGE;
}
1688
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001689static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1690{
1691 struct rbd_snap *snap;
1692
1693 while (!list_empty(&rbd_dev->snaps)) {
1694 snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node);
1695 __rbd_remove_snap_dev(rbd_dev, snap);
1696 }
1697}
1698
/*
 * only read the first part of the ondisk header, without the snaps info
 *
 * Re-reads the header, updates the disk capacity, swaps in the fresh
 * snapshot context/names/sizes under snap_rwsem, and rebuilds the
 * sysfs snap device list.  If we were pointing at the head, keep
 * following the (possibly moved) head sequence number.
 */
static int __rbd_update_snaps(struct rbd_device *rbd_dev)
{
	int ret;
	struct rbd_image_header h;
	u64 snap_seq;
	int follow_seq = 0;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	/* resized? */
	set_capacity(rbd_dev->disk, h.image_size / 512ULL);

	down_write(&rbd_dev->header.snap_rwsem);

	snap_seq = rbd_dev->header.snapc->seq;
	if (rbd_dev->header.total_snaps &&
	    rbd_dev->header.snapc->snaps[0] == snap_seq)
		/* pointing at the head, will need to follow that
		   if head moves */
		follow_seq = 1;

	/* free the old snapshot metadata before adopting the new */
	kfree(rbd_dev->header.snapc);
	kfree(rbd_dev->header.snap_names);
	kfree(rbd_dev->header.snap_sizes);

	rbd_dev->header.total_snaps = h.total_snaps;
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_names_len = h.snap_names_len;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	if (follow_seq)
		rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0];
	else
		rbd_dev->header.snapc->seq = snap_seq;

	ret = __rbd_init_snaps_header(rbd_dev);

	up_write(&rbd_dev->header.snap_rwsem);

	return ret;
}
1745
/*
 * Read the image header, set the mapped snapshot, and create/announce
 * the gendisk and its request queue for @rbd_dev.
 * Returns 0 on success, negative errno on failure.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	int rc;
	u64 total_size = 0;

	/* contact OSD, request size info about the object being mapped */
	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (rc)
		return rc;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = __rbd_init_snaps_header(rbd_dev);
	if (rc)
		return rc;

	rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size);
	if (rc)
		return rc;

	/* create gendisk info */
	rc = -ENOMEM;
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		goto out;

	snprintf(disk->disk_name, sizeof(disk->disk_name), DRV_NAME "%d",
		 rbd_dev->id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	rc = -ENOMEM;
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* set io sizes to object size */
	blk_queue_max_hw_sectors(q, rbd_obj_bytes(&rbd_dev->header) / 512ULL);
	blk_queue_max_segment_size(q, rbd_obj_bytes(&rbd_dev->header));
	blk_queue_io_min(q, rbd_obj_bytes(&rbd_dev->header));
	blk_queue_io_opt(q, rbd_obj_bytes(&rbd_dev->header));

	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;
	rbd_dev->q = q;

	/* finally, announce the disk to the world */
	set_capacity(disk, total_size / 512ULL);
	add_disk(disk);

	pr_info("%s: added with size 0x%llx\n",
		disk->disk_name, (unsigned long long)total_size);
	return 0;

out_disk:
	put_disk(disk);
out:
	return rc;
}
1813
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001814/*
1815 sysfs
1816*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001817
/* sysfs: image size in bytes */
static ssize_t rbd_size_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd(dev);

	return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size);
}
1825
/* sysfs: block device major number */
static ssize_t rbd_major_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd(dev);

	return sprintf(buf, "%d\n", rbd_dev->major);
}
1833
/* sysfs: ceph client instance id ("clientNNN") */
static ssize_t rbd_client_id_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd(dev);

	return sprintf(buf, "client%lld\n", ceph_client_id(rbd_dev->client));
}
1841
/* sysfs: rados pool name the image lives in */
static ssize_t rbd_pool_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd(dev);

	return sprintf(buf, "%s\n", rbd_dev->pool_name);
}
1849
/* sysfs: image (object) name */
static ssize_t rbd_name_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd(dev);

	return sprintf(buf, "%s\n", rbd_dev->obj);
}
1857
/* sysfs: name of the currently mapped snapshot ("-" for the head) */
static ssize_t rbd_snap_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd(dev);

	return sprintf(buf, "%s\n", rbd_dev->snap_name);
}
1866
/*
 * sysfs store: force a re-read of the image header and snapshot list.
 * Any write to the "refresh" attribute triggers __rbd_update_snaps()
 * under ctl_mutex; the written data itself is ignored.
 */
static ssize_t rbd_image_refresh(struct device *dev,
				 struct device_attribute *attr,
				 const char *buf,
				 size_t size)
{
	struct rbd_device *rbd_dev = dev_to_rbd(dev);
	int rc;
	int ret = size;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rc = __rbd_update_snaps(rbd_dev);
	if (rc < 0)
		ret = rc;

	mutex_unlock(&ctl_mutex);
	return ret;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001885
/* per-device sysfs attributes (Documentation/ABI/testing/sysfs-bus-rbd) */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
static DEVICE_ATTR(rollback_snap, S_IWUSR, NULL, rbd_snap_rollback);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_name.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	&dev_attr_create_snap.attr,
	&dev_attr_rollback_snap.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* rbd_device is embedded in struct rbd_device; nothing to free here */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
1927
1928
1929/*
1930 sysfs - snapshots
1931*/
1932
/* sysfs (per-snap): snapshot size in bytes */
static ssize_t rbd_snap_size_show(struct device *dev,
				  struct device_attribute *attr,
				  char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%lld\n", (long long)snap->size);
}
1941
/* sysfs (per-snap): rados snapshot id */
static ssize_t rbd_snap_id_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%lld\n", (long long)snap->id);
}
1950
/* per-snapshot sysfs attributes */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* snap devices own their rbd_snap: free name and struct on last put */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups = rbd_snap_attr_groups,
	.release = rbd_snap_dev_release,
};
1980
/*
 * Unlink @snap from the device's snap list and unregister its sysfs
 * device.  The rbd_snap itself is freed by rbd_snap_dev_release() on
 * the final device put.
 */
static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
				  struct rbd_snap *snap)
{
	list_del(&snap->node);
	device_unregister(&snap->dev);
}
1987
/*
 * Register @snap as a child sysfs device ("snap_<name>") of @parent.
 * Returns the device_register() result.
 */
static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
				  struct rbd_snap *snap,
				  struct device *parent)
{
	struct device *dev = &snap->dev;
	int ret;

	dev->type = &rbd_snap_device_type;
	dev->parent = parent;
	dev->release = rbd_snap_dev_release;
	dev_set_name(dev, "snap_%s", snap->name);
	ret = device_register(dev);

	return ret;
}
2003
2004static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
2005 int i, const char *name,
2006 struct rbd_snap **snapp)
2007{
2008 int ret;
2009 struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
2010 if (!snap)
2011 return -ENOMEM;
2012 snap->name = kstrdup(name, GFP_KERNEL);
2013 snap->size = rbd_dev->header.snap_sizes[i];
2014 snap->id = rbd_dev->header.snapc->snaps[i];
2015 if (device_is_registered(&rbd_dev->dev)) {
2016 ret = rbd_register_snap_dev(rbd_dev, snap,
2017 &rbd_dev->dev);
2018 if (ret < 0)
2019 goto err;
2020 }
2021 *snapp = snap;
2022 return 0;
2023err:
2024 kfree(snap->name);
2025 kfree(snap);
2026 return ret;
2027}
2028
/*
 * Find the previous entry in a NUL-delimited string list.
 *
 * @name points at the start of a string inside a buffer that begins at
 * @start; each string is terminated by '\0'.  Returns a pointer to the
 * start of the preceding string, or NULL when @name is already the
 * first entry (there is no room for even an empty string before it).
 */
const char *rbd_prev_snap_name(const char *name, const char *start)
{
	const char *p;

	/* need at least one byte + its terminator before @name */
	if (name - start < 2)
		return NULL;

	/* step over the previous string's NUL, then scan back to its head */
	for (p = name - 2; *p; p--) {
		if (p == start)
			return start;
	}
	return p + 1;
}
2045
/*
 * compare the old list of snapshots that we have to what's in the header
 * and update it accordingly. Note that the header holds the snapshots
 * in a reverse order (from newest to oldest) and we need to go from
 * older to new so that we don't get a duplicate snap name when
 * doing the process (e.g., removed snapshot and recreated a new
 * one with the same name.
 *
 * Walks the existing rbd_dev->snaps list (oldest first) in lockstep
 * with the header's snap id array (index i counts down) and the
 * NUL-delimited name blob (scanned backwards via rbd_prev_snap_name).
 */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
{
	const char *name, *first_name;
	int i = rbd_dev->header.total_snaps;
	struct rbd_snap *snap, *old_snap = NULL;
	int ret;
	struct list_head *p, *n;

	first_name = rbd_dev->header.snap_names;
	/* start one past the end; rbd_prev_snap_name() steps backwards */
	name = first_name + rbd_dev->header.snap_names_len;

	list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
		u64 cur_id;

		old_snap = list_entry(p, struct rbd_snap, node);

		if (i)
			cur_id = rbd_dev->header.snapc->snaps[i - 1];

		if (!i || old_snap->id < cur_id) {
			/* old_snap->id was skipped, thus was removed */
			__rbd_remove_snap_dev(rbd_dev, old_snap);
			continue;
		}
		if (old_snap->id == cur_id) {
			/* we have this snapshot already */
			i--;
			name = rbd_prev_snap_name(name, first_name);
			continue;
		}
		for (; i > 0;
		     i--, name = rbd_prev_snap_name(name, first_name)) {
			if (!name) {
				WARN_ON(1);
				return -EINVAL;
			}
			/* NOTE(review): uses snaps[i] here but snaps[i - 1]
			   above — verify this asymmetry is intentional */
			cur_id = rbd_dev->header.snapc->snaps[i];
			/* snapshot removal? handle it above */
			if (cur_id >= old_snap->id)
				break;
			/* a new snapshot */
			ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
			if (ret < 0)
				return ret;

			/* note that we add it backward so using n and not p */
			list_add(&snap->node, n);
			p = &snap->node;
		}
	}
	/* we're done going over the old snap list, just add what's left */
	for (; i > 0; i--) {
		name = rbd_prev_snap_name(name, first_name);
		if (!name) {
			WARN_ON(1);
			return -EINVAL;
		}
		ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
		if (ret < 0)
			return ret;
		list_add(&snap->node, &rbd_dev->snaps);
	}

	return 0;
}
2119
2120
/* root of the rbd sysfs hierarchy; statically allocated, nothing to free */
static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};
2129
/*
 * Register @rbd_dev on the rbd bus under rbd_root_dev, then register a
 * sysfs device for each of its snapshots.  Runs under ctl_mutex.
 *
 * NOTE(review): if a snapshot registration fails mid-loop, the loop
 * just breaks — the main device stays registered and earlier snaps are
 * not rolled back; the "done_free" label frees nothing.  Confirm this
 * partial-failure behavior is intended.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	int ret = -ENOMEM;
	struct device *dev;
	struct rbd_snap *snap;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	dev = &rbd_dev->dev;

	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->id);
	ret = device_register(dev);
	if (ret < 0)
		goto done_free;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		ret = rbd_register_snap_dev(rbd_dev, snap,
					     &rbd_dev->dev);
		if (ret < 0)
			break;
	}

	mutex_unlock(&ctl_mutex);
	return 0;
done_free:
	mutex_unlock(&ctl_mutex);
	return ret;
}
2161
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002162static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2163{
2164 device_unregister(&rbd_dev->dev);
2165}
2166
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002167static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2168{
2169 int ret, rc;
2170
2171 do {
2172 ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name,
2173 rbd_dev->header.obj_version);
2174 if (ret == -ERANGE) {
2175 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2176 rc = __rbd_update_snaps(rbd_dev);
2177 mutex_unlock(&ctl_mutex);
2178 if (rc < 0)
2179 return rc;
2180 }
2181 } while (ret == -ERANGE);
2182
2183 return ret;
2184}
2185
2186static ssize_t rbd_add(struct bus_type *bus,
2187 const char *buf,
2188 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002189{
2190 struct ceph_osd_client *osdc;
2191 struct rbd_device *rbd_dev;
2192 ssize_t rc = -ENOMEM;
2193 int irc, new_id = 0;
2194 struct list_head *tmp;
2195 char *mon_dev_name;
2196 char *options;
2197
2198 if (!try_module_get(THIS_MODULE))
2199 return -ENODEV;
2200
2201 mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
2202 if (!mon_dev_name)
2203 goto err_out_mod;
2204
2205 options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
2206 if (!options)
2207 goto err_mon_dev;
2208
2209 /* new rbd_device object */
2210 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2211 if (!rbd_dev)
2212 goto err_out_opt;
2213
2214 /* static rbd_device initialization */
2215 spin_lock_init(&rbd_dev->lock);
2216 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002217 INIT_LIST_HEAD(&rbd_dev->snaps);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002218
2219 /* generate unique id: find highest unique id, add one */
2220 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2221
2222 list_for_each(tmp, &rbd_dev_list) {
2223 struct rbd_device *rbd_dev;
2224
2225 rbd_dev = list_entry(tmp, struct rbd_device, node);
2226 if (rbd_dev->id >= new_id)
2227 new_id = rbd_dev->id + 1;
2228 }
2229
2230 rbd_dev->id = new_id;
2231
2232 /* add to global list */
2233 list_add_tail(&rbd_dev->node, &rbd_dev_list);
2234
2235 /* parse add command */
2236 if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s "
2237 "%" __stringify(RBD_MAX_OPT_LEN) "s "
2238 "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s "
2239 "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s"
2240 "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
2241 mon_dev_name, options, rbd_dev->pool_name,
2242 rbd_dev->obj, rbd_dev->snap_name) < 4) {
2243 rc = -EINVAL;
2244 goto err_out_slot;
2245 }
2246
2247 if (rbd_dev->snap_name[0] == 0)
2248 rbd_dev->snap_name[0] = '-';
2249
2250 rbd_dev->obj_len = strlen(rbd_dev->obj);
2251 snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s",
2252 rbd_dev->obj, RBD_SUFFIX);
2253
2254 /* initialize rest of new object */
2255 snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id);
2256 rc = rbd_get_client(rbd_dev, mon_dev_name, options);
2257 if (rc < 0)
2258 goto err_out_slot;
2259
2260 mutex_unlock(&ctl_mutex);
2261
2262 /* pick the pool */
2263 osdc = &rbd_dev->client->osdc;
2264 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2265 if (rc < 0)
2266 goto err_out_client;
2267 rbd_dev->poolid = rc;
2268
2269 /* register our block device */
2270 irc = register_blkdev(0, rbd_dev->name);
2271 if (irc < 0) {
2272 rc = irc;
2273 goto err_out_client;
2274 }
2275 rbd_dev->major = irc;
2276
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002277 rc = rbd_bus_add_dev(rbd_dev);
2278 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002279 goto err_out_blkdev;
2280
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002281 /* set up and announce blkdev mapping */
2282 rc = rbd_init_disk(rbd_dev);
2283 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002284 goto err_out_bus;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002285
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002286 rc = rbd_init_watch_dev(rbd_dev);
2287 if (rc)
2288 goto err_out_bus;
2289
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002290 return count;
2291
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002292err_out_bus:
2293 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2294 list_del_init(&rbd_dev->node);
2295 mutex_unlock(&ctl_mutex);
2296
2297 /* this will also clean up rest of rbd_dev stuff */
2298
2299 rbd_bus_del_dev(rbd_dev);
2300 kfree(options);
2301 kfree(mon_dev_name);
2302 return rc;
2303
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002304err_out_blkdev:
2305 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2306err_out_client:
2307 rbd_put_client(rbd_dev);
2308 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2309err_out_slot:
2310 list_del_init(&rbd_dev->node);
2311 mutex_unlock(&ctl_mutex);
2312
2313 kfree(rbd_dev);
2314err_out_opt:
2315 kfree(options);
2316err_mon_dev:
2317 kfree(mon_dev_name);
2318err_out_mod:
2319 dout("Error adding device %s\n", buf);
2320 module_put(THIS_MODULE);
2321 return rc;
2322}
2323
2324static struct rbd_device *__rbd_get_dev(unsigned long id)
2325{
2326 struct list_head *tmp;
2327 struct rbd_device *rbd_dev;
2328
2329 list_for_each(tmp, &rbd_dev_list) {
2330 rbd_dev = list_entry(tmp, struct rbd_device, node);
2331 if (rbd_dev->id == id)
2332 return rbd_dev;
2333 }
2334 return NULL;
2335}
2336
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002337static void rbd_dev_release(struct device *dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002338{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002339 struct rbd_device *rbd_dev =
2340 container_of(dev, struct rbd_device, dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002341
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002342 if (rbd_dev->watch_request)
2343 ceph_osdc_unregister_linger_request(&rbd_dev->client->osdc,
2344 rbd_dev->watch_request);
2345 if (rbd_dev->watch_event)
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07002346 rbd_req_sync_unwatch(rbd_dev, rbd_dev->obj_md_name);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002347
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002348 rbd_put_client(rbd_dev);
2349
2350 /* clean up and free blkdev */
2351 rbd_free_disk(rbd_dev);
2352 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2353 kfree(rbd_dev);
2354
2355 /* release module ref */
2356 module_put(THIS_MODULE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002357}
2358
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002359static ssize_t rbd_remove(struct bus_type *bus,
2360 const char *buf,
2361 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002362{
2363 struct rbd_device *rbd_dev = NULL;
2364 int target_id, rc;
2365 unsigned long ul;
2366 int ret = count;
2367
2368 rc = strict_strtoul(buf, 10, &ul);
2369 if (rc)
2370 return rc;
2371
2372 /* convert to int; abort if we lost anything in the conversion */
2373 target_id = (int) ul;
2374 if (target_id != ul)
2375 return -EINVAL;
2376
2377 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2378
2379 rbd_dev = __rbd_get_dev(target_id);
2380 if (!rbd_dev) {
2381 ret = -ENOENT;
2382 goto done;
2383 }
2384
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002385 list_del_init(&rbd_dev->node);
2386
2387 __rbd_remove_all_snaps(rbd_dev);
2388 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002389
2390done:
2391 mutex_unlock(&ctl_mutex);
2392 return ret;
2393}
2394
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002395static ssize_t rbd_snap_add(struct device *dev,
2396 struct device_attribute *attr,
2397 const char *buf,
2398 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002399{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002400 struct rbd_device *rbd_dev = dev_to_rbd(dev);
2401 int ret;
2402 char *name = kmalloc(count + 1, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002403 if (!name)
2404 return -ENOMEM;
2405
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002406 snprintf(name, count, "%s", buf);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002407
2408 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2409
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002410 ret = rbd_header_add_snap(rbd_dev,
2411 name, GFP_KERNEL);
2412 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002413 goto err_unlock;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002414
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002415 ret = __rbd_update_snaps(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002416 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002417 goto err_unlock;
2418
2419 /* shouldn't hold ctl_mutex when notifying.. notify might
2420 trigger a watch callback that would need to get that mutex */
2421 mutex_unlock(&ctl_mutex);
2422
2423 /* make a best effort, don't error if failed */
2424 rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002425
2426 ret = count;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002427 kfree(name);
2428 return ret;
2429
2430err_unlock:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002431 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002432 kfree(name);
2433 return ret;
2434}
2435
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002436static ssize_t rbd_snap_rollback(struct device *dev,
2437 struct device_attribute *attr,
2438 const char *buf,
2439 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002440{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002441 struct rbd_device *rbd_dev = dev_to_rbd(dev);
2442 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002443 u64 snapid;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002444 u64 cur_ofs;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002445 char *seg_name = NULL;
2446 char *snap_name = kmalloc(count + 1, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002447 ret = -ENOMEM;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002448 if (!snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002449 return ret;
2450
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002451 /* parse snaps add command */
2452 snprintf(snap_name, count, "%s", buf);
2453 seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
2454 if (!seg_name)
2455 goto done;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002456
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002457 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002458
2459 ret = snap_by_name(&rbd_dev->header, snap_name, &snapid, NULL);
2460 if (ret < 0)
2461 goto done_unlock;
2462
2463 dout("snapid=%lld\n", snapid);
2464
2465 cur_ofs = 0;
2466 while (cur_ofs < rbd_dev->header.image_size) {
2467 cur_ofs += rbd_get_segment(&rbd_dev->header,
2468 rbd_dev->obj,
2469 cur_ofs, (u64)-1,
2470 seg_name, NULL);
2471 dout("seg_name=%s\n", seg_name);
2472
2473 ret = rbd_req_sync_rollback_obj(rbd_dev, snapid, seg_name);
2474 if (ret < 0)
2475 pr_warning("could not roll back obj %s err=%d\n",
2476 seg_name, ret);
2477 }
2478
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002479 ret = __rbd_update_snaps(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002480 if (ret < 0)
2481 goto done_unlock;
2482
2483 ret = count;
2484
2485done_unlock:
2486 mutex_unlock(&ctl_mutex);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002487done:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002488 kfree(seg_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002489 kfree(snap_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002490
2491 return ret;
2492}
2493
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002494static struct bus_attribute rbd_bus_attrs[] = {
2495 __ATTR(add, S_IWUSR, NULL, rbd_add),
2496 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002497 __ATTR_NULL
2498};
2499
2500/*
2501 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002502 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002503 */
2504static int rbd_sysfs_init(void)
2505{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002506 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002507
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002508 rbd_bus_type.bus_attrs = rbd_bus_attrs;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002509
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002510 ret = bus_register(&rbd_bus_type);
2511 if (ret < 0)
2512 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002513
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002514 ret = device_register(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002515
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002516 return ret;
2517}
2518
/*
 * Tear down the sysfs interface created by rbd_sysfs_init(), in reverse
 * order of creation: root device first, then the bus.
 */
static void rbd_sysfs_cleanup(void)
{
	device_unregister(&rbd_root_dev);
	bus_unregister(&rbd_bus_type);
}
2524
2525int __init rbd_init(void)
2526{
2527 int rc;
2528
2529 rc = rbd_sysfs_init();
2530 if (rc)
2531 return rc;
2532 spin_lock_init(&node_lock);
2533 pr_info("loaded " DRV_NAME_LONG "\n");
2534 return 0;
2535}
2536
/*
 * Module exit: remove the sysfs control files and bus.  Mapped devices
 * hold module references, so this only runs once every image has been
 * unmapped.
 */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2541
module_init(rbd_init);
module_exit(rbd_exit);

/* Module metadata */
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");