blob: bccd350a0323cb6c50e34e251a384209a78ee6f1 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
#define DRV_NAME "rbd"
#define DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR 256		/* max minors per blkdev */

/* header object name = image name + suffix (sizeof includes the NUL) */
#define RBD_MAX_MD_NAME_LEN	(RBD_MAX_OBJ_NAME_LEN + sizeof(RBD_SUFFIX))
#define RBD_MAX_POOL_NAME_LEN	64
#define RBD_MAX_SNAP_NAME_LEN	32
#define RBD_MAX_OPT_LEN		1024

/* snapshot name that refers to the unsnapshotted head of the image */
#define RBD_SNAP_HEAD_NAME	"-"

/* max length of the block device name, e.g. "rbd3" */
#define DEV_NAME_LEN		32

/* default for the notify_timeout option (units per watch/notify API
 * -- presumably seconds; see parse_rbd_opts_token) */
#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
59
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	u64 image_size;			/* image size in bytes */
	char block_name[32];		/* prefix for data object names
					   ("<block_name>.<segment>") */
	__u8 obj_order;			/* object size is 1 << obj_order */
	__u8 crypt_type;		/* copied from on-disk header */
	__u8 comp_type;			/* copied from on-disk header */
	struct rw_semaphore snap_rwsem;	/* protects the snapshot state below */
	struct ceph_snap_context *snapc;
	size_t snap_names_len;		/* total bytes in snap_names */
	u64 snap_seq;			/* snap seq copied from on-disk header */
	u32 total_snaps;		/* number of snapshots */

	char *snap_names;		/* NUL-separated names, one per snap */
	u64 *snap_sizes;		/* image size at each snapshot */

	u64 obj_version;		/* header object version (see *ver
					   reported by rbd_do_request) */
};
80
/* per-client options parsed by parse_rbd_opts_token() */
struct rbd_options {
	int notify_timeout;	/* "notify_timeout=%d"; defaults to
				   RBD_NOTIFY_TIMEOUT_DEFAULT */
};
84
/*
 * an instance of the client. multiple devices may share a client.
 */
struct rbd_client {
	struct ceph_client *client;	/* connection to the ceph cluster */
	struct rbd_options *rbd_opts;	/* owned; freed in rbd_client_release() */
	struct kref kref;		/* released via rbd_client_release() */
	struct list_head node;		/* entry in rbd_client_list */
};
94
Yehuda Sadeh1fec7092011-05-13 13:52:56 -070095struct rbd_req_coll;
96
Yehuda Sadeh602adf42010-08-12 16:11:25 -070097/*
98 * a single io request
99 */
100struct rbd_request {
101 struct request *rq; /* blk layer request */
102 struct bio *bio; /* cloned bio */
103 struct page **pages; /* list of used pages */
104 u64 len;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700105 int coll_index;
106 struct rbd_req_coll *coll;
107};
108
/* completion state of one sub-request within a collection */
struct rbd_req_status {
	int done;	/* nonzero once the sub-request has completed */
	int rc;		/* completion status */
	u64 bytes;	/* bytes transferred */
};
114
/*
 * a collection of requests
 */
struct rbd_req_coll {
	int total;			/* number of sub-requests */
	int num_done;			/* sub-requests completed in order */
	struct kref kref;		/* released via rbd_coll_release() */
	struct rbd_req_status status[0];	/* one slot per sub-request */
};
124
/* in-memory representation of a single snapshot, exported via sysfs */
struct rbd_snap {
	struct device dev;	/* sysfs device node */
	const char *name;	/* snapshot name */
	size_t size;		/* image size at this snapshot */
	struct list_head node;	/* entry in rbd_device->snaps */
	u64 id;			/* snapshot id */
};
132
/*
 * a single device
 */
struct rbd_device {
	int id;			/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;	/* blkdev's gendisk and rq */
	struct request_queue *q;

	struct rbd_client *rbd_client;	/* shared ceph client (refcounted) */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;	/* queue lock */

	struct rbd_image_header header;
	char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
	int obj_len;
	char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
	char pool_name[RBD_MAX_POOL_NAME_LEN];
	int poolid;

	/* watch on the header object, for update notifications */
	struct ceph_osd_event *watch_event;
	struct ceph_osd_request *watch_request;

	char snap_name[RBD_MAX_SNAP_NAME_LEN];
	u32 cur_snap;	/* index+1 of current snapshot within snap context
			   0 - for the head */
	int read_only;	/* set when a snapshot is mapped; enforced in rbd_open */

	struct list_head node;	/* presumably entry in rbd_dev_list -- the
				   list_add is outside this view; verify */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
};
172
/* sysfs bus all rbd devices hang off of */
static struct bus_type rbd_bus_type = {
	.name		= "rbd",
};

static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);	/* protects rbd_dev_list */

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(node_lock);	/* protects client get/put */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700184
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800185static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
186static void rbd_dev_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800187static ssize_t rbd_snap_add(struct device *dev,
188 struct device_attribute *attr,
189 const char *buf,
190 size_t count);
191static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
Justin P. Mattock69932482011-07-26 23:06:29 -0700192 struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800193
194
195static struct rbd_device *dev_to_rbd(struct device *dev)
196{
197 return container_of(dev, struct rbd_device, dev);
198}
199
200static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
201{
202 return get_device(&rbd_dev->dev);
203}
204
205static void rbd_put_dev(struct rbd_device *rbd_dev)
206{
207 put_device(&rbd_dev->dev);
208}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700209
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700210static int __rbd_update_snaps(struct rbd_device *rbd_dev);
211
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700212static int rbd_open(struct block_device *bdev, fmode_t mode)
213{
214 struct gendisk *disk = bdev->bd_disk;
215 struct rbd_device *rbd_dev = disk->private_data;
216
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800217 rbd_get_dev(rbd_dev);
218
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700219 set_device_ro(bdev, rbd_dev->read_only);
220
221 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
222 return -EROFS;
223
224 return 0;
225}
226
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800227static int rbd_release(struct gendisk *disk, fmode_t mode)
228{
229 struct rbd_device *rbd_dev = disk->private_data;
230
231 rbd_put_dev(rbd_dev);
232
233 return 0;
234}
235
/* block device operations: open/release pair manages the device ref */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
241
242/*
243 * Initialize an rbd client instance.
244 * We own *opt.
245 */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700246static struct rbd_client *rbd_client_create(struct ceph_options *opt,
247 struct rbd_options *rbd_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700248{
249 struct rbd_client *rbdc;
250 int ret = -ENOMEM;
251
252 dout("rbd_client_create\n");
253 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
254 if (!rbdc)
255 goto out_opt;
256
257 kref_init(&rbdc->kref);
258 INIT_LIST_HEAD(&rbdc->node);
259
Sage Weil6ab00d42011-08-09 09:41:59 -0700260 rbdc->client = ceph_create_client(opt, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700261 if (IS_ERR(rbdc->client))
262 goto out_rbdc;
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400263 opt = NULL; /* Now rbdc->client is responsible for opt */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700264
265 ret = ceph_open_session(rbdc->client);
266 if (ret < 0)
267 goto out_err;
268
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700269 rbdc->rbd_opts = rbd_opts;
270
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700271 spin_lock(&node_lock);
272 list_add_tail(&rbdc->node, &rbd_client_list);
273 spin_unlock(&node_lock);
274
275 dout("rbd_client_create created %p\n", rbdc);
276 return rbdc;
277
278out_err:
279 ceph_destroy_client(rbdc->client);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700280out_rbdc:
281 kfree(rbdc);
282out_opt:
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400283 if (opt)
284 ceph_destroy_options(opt);
285 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700286}
287
/*
 * Find a ceph client with specific addr and configuration.
 *
 * Caller must hold node_lock (see rbd_get_client()).  Returns NULL
 * when client sharing is disabled (CEPH_OPT_NOSHARE) or when no
 * matching client exists.
 */
static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
{
	struct rbd_client *client_node;

	if (opt->flags & CEPH_OPT_NOSHARE)
		return NULL;

	list_for_each_entry(client_node, &rbd_client_list, node)
		if (ceph_compare_options(opt, client_node->client) == 0)
			return client_node;
	return NULL;
}
303
/*
 * mount options
 *
 * Tokens below Opt_last_int take an integer argument; tokens between
 * Opt_last_int and Opt_last_string take a string argument (none yet).
 */
enum {
	Opt_notify_timeout,	/* watch/notify timeout */
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
};

static match_table_t rbdopt_tokens = {
	{Opt_notify_timeout, "notify_timeout=%d"},
	/* int args above */
	/* string args above */
	{-1, NULL}
};
321
/*
 * Parse a single "key[=value]" option token into *private (a struct
 * rbd_options).  Returns 0 on success, -EINVAL on an unrecognized
 * token, or the match_int() error for a malformed integer argument.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbdopt = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbdopt_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	if (token < Opt_last_int) {
		/* integer-valued option: parse the %d argument */
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_notify_timeout:
		rbdopt->notify_timeout = intval;
		break;
	default:
		/* every recognized token must be handled above */
		BUG_ON(token);
	}
	return 0;
}
356
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.
 *
 * On success, stores the (referenced) client in rbd_dev->rbd_client
 * and returns 0.  Ownership: a newly created client takes over both
 * 'opt' and 'rbd_opts'; when an existing client is reused, both are
 * freed here.
 */
static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
			  char *options)
{
	struct rbd_client *rbdc;
	struct ceph_options *opt;
	int ret;
	struct rbd_options *rbd_opts;

	rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		return -ENOMEM;

	rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;

	opt = ceph_parse_options(options, mon_addr,
				 mon_addr + strlen(mon_addr),
				 parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(opt)) {
		ret = PTR_ERR(opt);
		goto done_err;
	}

	spin_lock(&node_lock);
	rbdc = __rbd_client_find(opt);
	if (rbdc) {
		/* using an existing client */
		kref_get(&rbdc->kref);
		spin_unlock(&node_lock);

		rbd_dev->rbd_client = rbdc;

		/* the shared client keeps its own copies */
		ceph_destroy_options(opt);
		kfree(rbd_opts);

		return 0;
	}
	spin_unlock(&node_lock);

	/* rbd_client_create() consumes 'opt' even on failure */
	rbdc = rbd_client_create(opt, rbd_opts);
	if (IS_ERR(rbdc)) {
		ret = PTR_ERR(rbdc);
		goto done_err;
	}

	rbd_dev->rbd_client = rbdc;
	return 0;
done_err:
	kfree(rbd_opts);
	return ret;
}
411
/*
 * Destroy ceph client
 *
 * Caller must hold node_lock.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	/* unlink from rbd_client_list; this is why node_lock is required */
	list_del(&rbdc->node);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc->rbd_opts);
	kfree(rbdc);
}
428
/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_device *rbd_dev)
{
	/* node_lock protects the list_del() in rbd_client_release() */
	spin_lock(&node_lock);
	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
	spin_unlock(&node_lock);
	rbd_dev->rbd_client = NULL;
}
440
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700441/*
442 * Destroy requests collection
443 */
444static void rbd_coll_release(struct kref *kref)
445{
446 struct rbd_req_coll *coll =
447 container_of(kref, struct rbd_req_coll, kref);
448
449 dout("rbd_coll_release %p\n", coll);
450 kfree(coll);
451}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700452
/*
 * Create a new header structure, translate header format from the on-disk
 * header.
 *
 * 'allocated_snaps' is the number of snapshot slots the caller sized
 * 'ondisk' for; snapshot ids/sizes/names are copied only when it equals
 * the on-disk snap_count (callers presumably re-read with a larger
 * buffer otherwise -- the read loop is outside this view).
 *
 * Returns 0 on success, -ENXIO on a bad header signature, -ENOMEM on
 * allocation failure.
 */
static int rbd_header_from_disk(struct rbd_image_header *header,
				struct rbd_image_header_ondisk *ondisk,
				int allocated_snaps,
				gfp_t gfp_flags)
{
	int i;
	u32 snap_count = le32_to_cpu(ondisk->snap_count);
	int ret = -ENOMEM;

	/* reject anything not carrying the rbd header signature */
	if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT)))
		return -ENXIO;

	init_rwsem(&header->snap_rwsem);
	header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
	/*
	 * NOTE(review): the per-snap element size here is sizeof (*ondisk);
	 * snapc->snaps[] holds u64 ids, so this looks oversized -- verify.
	 */
	header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
				snap_count * sizeof (*ondisk),
				gfp_flags);
	if (!header->snapc)
		return -ENOMEM;
	if (snap_count) {
		/* NOTE(review): GFP_KERNEL used here rather than gfp_flags --
		 * confirm intentional */
		header->snap_names = kmalloc(header->snap_names_len,
					     GFP_KERNEL);
		if (!header->snap_names)
			goto err_snapc;
		header->snap_sizes = kmalloc(snap_count * sizeof(u64),
					     GFP_KERNEL);
		if (!header->snap_sizes)
			goto err_names;
	} else {
		header->snap_names = NULL;
		header->snap_sizes = NULL;
	}
	memcpy(header->block_name, ondisk->block_name,
	       sizeof(ondisk->block_name));

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->obj_order = ondisk->options.order;
	header->crypt_type = ondisk->options.crypt_type;
	header->comp_type = ondisk->options.comp_type;

	atomic_set(&header->snapc->nref, 1);
	header->snap_seq = le64_to_cpu(ondisk->snap_seq);
	header->snapc->num_snaps = snap_count;
	header->total_snaps = snap_count;

	if (snap_count && allocated_snaps == snap_count) {
		for (i = 0; i < snap_count; i++) {
			header->snapc->snaps[i] =
				le64_to_cpu(ondisk->snaps[i].id);
			header->snap_sizes[i] =
				le64_to_cpu(ondisk->snaps[i].image_size);
		}

		/* copy snapshot names, packed right after the snap array */
		memcpy(header->snap_names, &ondisk->snaps[i],
		       header->snap_names_len);
	}

	return 0;

err_names:
	kfree(header->snap_names);
err_snapc:
	kfree(header->snapc);
	return ret;
}
523
524static int snap_index(struct rbd_image_header *header, int snap_num)
525{
526 return header->total_snaps - snap_num;
527}
528
529static u64 cur_snap_id(struct rbd_device *rbd_dev)
530{
531 struct rbd_image_header *header = &rbd_dev->header;
532
533 if (!rbd_dev->cur_snap)
534 return 0;
535
536 return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)];
537}
538
539static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
540 u64 *seq, u64 *size)
541{
542 int i;
543 char *p = header->snap_names;
544
545 for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
546 if (strcmp(snap_name, p) == 0)
547 break;
548 }
549 if (i == header->total_snaps)
550 return -ENOENT;
551 if (seq)
552 *seq = header->snapc->snaps[i];
553
554 if (size)
555 *size = header->snap_sizes[i];
556
557 return i;
558}
559
/*
 * Point the device at the snapshot named in dev->snap_name -- or at
 * the image head when it is RBD_SNAP_HEAD_NAME -- updating snapc->seq,
 * cur_snap and read_only, and optionally reporting the mapped size.
 * Returns 0 on success or -ENOENT if the snapshot does not exist.
 */
static int rbd_header_set_snap(struct rbd_device *dev, u64 *size)
{
	struct rbd_image_header *header = &dev->header;
	struct ceph_snap_context *snapc = header->snapc;
	int ret = -ENOENT;

	/* the memcmp below reads sizeof(RBD_SNAP_HEAD_NAME) bytes */
	BUILD_BUG_ON(sizeof (dev->snap_name) < sizeof (RBD_SNAP_HEAD_NAME));

	down_write(&header->snap_rwsem);

	if (!memcmp(dev->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		/* mapping the head: writable, full image size */
		if (header->total_snaps)
			snapc->seq = header->snap_seq;
		else
			snapc->seq = 0;
		dev->cur_snap = 0;
		dev->read_only = 0;
		if (size)
			*size = header->image_size;
	} else {
		ret = snap_by_name(header, dev->snap_name, &snapc->seq, size);
		if (ret < 0)
			goto done;

		/* snapshots are mapped read-only */
		dev->cur_snap = header->total_snaps - ret;
		dev->read_only = 1;
	}

	ret = 0;
done:
	up_write(&header->snap_rwsem);
	return ret;
}
594
595static void rbd_header_free(struct rbd_image_header *header)
596{
597 kfree(header->snapc);
598 kfree(header->snap_names);
599 kfree(header->snap_sizes);
600}
601
602/*
603 * get the actual striped segment name, offset and length
604 */
605static u64 rbd_get_segment(struct rbd_image_header *header,
606 const char *block_name,
607 u64 ofs, u64 len,
608 char *seg_name, u64 *segofs)
609{
610 u64 seg = ofs >> header->obj_order;
611
612 if (seg_name)
613 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
614 "%s.%012llx", block_name, seg);
615
616 ofs = ofs & ((1 << header->obj_order) - 1);
617 len = min_t(u64, len, (1 << header->obj_order) - ofs);
618
619 if (segofs)
620 *segofs = ofs;
621
622 return len;
623}
624
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700625static int rbd_get_num_segments(struct rbd_image_header *header,
626 u64 ofs, u64 len)
627{
628 u64 start_seg = ofs >> header->obj_order;
629 u64 end_seg = (ofs + len - 1) >> header->obj_order;
630 return end_seg - start_seg + 1;
631}
632
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700633/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700634 * returns the size of an object in the image
635 */
636static u64 rbd_obj_bytes(struct rbd_image_header *header)
637{
638 return 1 << header->obj_order;
639}
640
641/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700642 * bio helpers
643 */
644
645static void bio_chain_put(struct bio *chain)
646{
647 struct bio *tmp;
648
649 while (chain) {
650 tmp = chain;
651 chain = chain->bi_next;
652 bio_put(tmp);
653 }
654}
655
/*
 * zeros a bio chain, starting at specific offset
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* byte position within the whole chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* zero only the part of this segment
				   at or past start_ofs */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
682
/*
 * bio_chain_clone - clone a chain of bios up to a certain length.
 * might return a bio_pair that will need to be released.
 *
 * On return, *old points at the first un-consumed bio of the input
 * chain and *next at the bio to continue from (the second half of a
 * split, when one happened).  Returns NULL on allocation failure.
 */
static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
				   struct bio_pair **bp,
				   int len, gfp_t gfpmask)
{
	struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
	int total = 0;

	/* release the pair left over from the previous call, if any */
	if (*bp) {
		bio_pair_release(*bp);
		*bp = NULL;
	}

	while (old_chain && (total < len)) {
		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
		if (!tmp)
			goto err_out;

		if (total + old_chain->bi_size > len) {
			struct bio_pair *bp;

			/*
			 * this split can only happen with a single paged bio,
			 * split_bio will BUG_ON if this is not the case
			 */
			dout("bio_chain_clone split! total=%d remaining=%d"
			     "bi_size=%d\n",
			     (int)total, (int)len-total,
			     (int)old_chain->bi_size);

			/* split the bio. We'll release it either in the next
			   call, or it will have to be released outside */
			bp = bio_split(old_chain, (len - total) / 512ULL);
			/* NOTE(review): 'tmp' appears to leak on this
			   failure path -- verify */
			if (!bp)
				goto err_out;

			__bio_clone(tmp, &bp->bio1);

			*next = &bp->bio2;
		} else {
			__bio_clone(tmp, old_chain);
			*next = old_chain->bi_next;
		}

		tmp->bi_bdev = NULL;
		/* only the first allocation may block */
		gfpmask &= ~__GFP_WAIT;
		tmp->bi_next = NULL;

		if (!new_chain) {
			new_chain = tail = tmp;
		} else {
			tail->bi_next = tmp;
			tail = tmp;
		}
		old_chain = old_chain->bi_next;

		total += tmp->bi_size;
	}

	BUG_ON(total < len);

	if (tail)
		tail->bi_next = NULL;

	*old = old_chain;

	return new_chain;

err_out:
	dout("bio_chain_clone with err\n");
	bio_chain_put(new_chain);
	return NULL;
}
759
760/*
761 * helpers for osd request op vectors.
762 */
763static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
764 int num_ops,
765 int opcode,
766 u32 payload_len)
767{
768 *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
769 GFP_NOIO);
770 if (!*ops)
771 return -ENOMEM;
772 (*ops)[0].op = opcode;
773 /*
774 * op extent offset and length will be set later on
775 * in calc_raw_layout()
776 */
777 (*ops)[0].payload_len = payload_len;
778 return 0;
779}
780
/* Free an op vector allocated by rbd_create_rw_ops(). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
785
/*
 * Record completion of sub-request 'index' of a collection and
 * complete, in order, any contiguous run of finished sub-requests
 * starting at coll->num_done.  With no collection, the blk request is
 * completed directly.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n",
	     coll, index, ret, len);

	if (!rq)
		return;

	if (!coll) {
		/* not part of a collection: complete directly */
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	/* queue_lock serializes updates to the collection state */
	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	max = min = coll->num_done;
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i < max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		/* each completed sub-request drops one collection ref */
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
823
824static void rbd_coll_end_req(struct rbd_request *req,
825 int ret, u64 len)
826{
827 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
828}
829
/*
 * Send ceph osd request
 *
 * When rbd_cb is NULL the call is synchronous: it waits for the
 * request, reports the reassert version through *ver, and drops the
 * request reference itself.  With a callback, completion handling
 * (including freeing req_data) is the callback's job.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *obj, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *ops,
			  int num_reply,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *req,
					 struct ceph_msg *msg),
			  struct ceph_osd_request **linger_req,
			  u64 *ver)
{
	struct ceph_osd_request *req;
	struct ceph_file_layout *layout;
	int ret;
	u64 bno;
	struct timespec mtime = CURRENT_TIME;
	struct rbd_request *req_data;
	struct ceph_osd_request_head *reqhead;
	struct rbd_image_header *header = &dev->header;
	struct ceph_osd_client *osdc;

	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
	if (!req_data) {
		/* still must complete our slot in the collection */
		if (coll)
			rbd_coll_end_req_index(rq, coll, coll_index,
					       -ENOMEM, len);
		return -ENOMEM;
	}

	if (coll) {
		req_data->coll = coll;
		req_data->coll_index = coll_index;
	}

	dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs);

	/* hold the snap context stable while the request is built */
	down_read(&header->snap_rwsem);

	osdc = &dev->rbd_client->client->osdc;
	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
				      false, GFP_NOIO, pages, bio);
	if (!req) {
		up_read(&header->snap_rwsem);
		ret = -ENOMEM;
		goto done_pages;
	}

	req->r_callback = rbd_cb;

	req_data->rq = rq;
	req_data->bio = bio;
	req_data->pages = pages;
	req_data->len = len;

	req->r_priv = req_data;

	reqhead = req->r_request->front.iov_base;
	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);

	strncpy(req->r_oid, obj, sizeof(req->r_oid));
	req->r_oid_len = strlen(req->r_oid);

	/* one object per stripe unit: no striping across objects */
	layout = &req->r_file_layout;
	memset(layout, 0, sizeof(*layout));
	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_stripe_count = cpu_to_le32(1);
	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_pg_preferred = cpu_to_le32(-1);
	layout->fl_pg_pool = cpu_to_le32(dev->poolid);
	ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
			     req, ops);

	ceph_osdc_build_request(req, ofs, &len,
				ops,
				snapc,
				&mtime,
				req->r_oid, req->r_oid_len);
	up_read(&header->snap_rwsem);

	if (linger_req) {
		/* keep the request alive for watch/notify resends */
		ceph_osdc_set_request_linger(osdc, req);
		*linger_req = req;
	}

	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* synchronous: wait here and drop our request ref */
		ret = ceph_osdc_wait_request(osdc, req);
		if (ver)
			*ver = le64_to_cpu(req->r_reassert_version.version);
		dout("reassert_ver=%lld\n",
		     le64_to_cpu(req->r_reassert_version.version));
		ceph_osdc_put_request(req);
	}
	return ret;

done_err:
	bio_chain_put(req_data->bio);
	ceph_osdc_put_request(req);
done_pages:
	rbd_coll_end_req(req_data, ret, len);
	kfree(req_data);
	return ret;
}
946
/*
 * Ceph osd op callback
 *
 * Completes the rbd_request attached to the osd request: zero-fills
 * reads of missing objects and short reads, reports completion to the
 * collection, then drops the bio chain, the osd request, and req_data.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);

	if (rc == -ENOENT && read_op) {
		/* reading a hole (object doesn't exist): all zeroes */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero-fill the tail */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
985
/* Fire-and-forget completion callback: just drop the request ref. */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
990
/*
 * Do a synchronous ceph osd operation
 *
 * Data travels through a temporary page vector: for writes 'buf' is
 * copied in beforehand, for reads the result is copied back out.
 * When 'orig_ops' is NULL a single-op vector is built from 'opcode'
 * (and destroyed again here); otherwise the caller's ops are used
 * and remain owned by the caller.
 */
static int rbd_req_sync_op(struct rbd_device *dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int opcode,
			   int flags,
			   struct ceph_osd_req_op *orig_ops,
			   int num_reply,
			   const char *obj,
			   u64 ofs, u64 len,
			   char *buf,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;
	struct ceph_osd_req_op *ops = orig_ops;
	u32 payload_len;

	num_pages = calc_pages_for(ofs, len);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	if (!orig_ops) {
		/* build a single-op vector for the caller's opcode */
		payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
		ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
		if (ret < 0)
			goto done;

		if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
			ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
			if (ret < 0)
				goto done_ops;
		}
	}

	/* NULL callback makes rbd_do_request() wait for completion */
	ret = rbd_do_request(NULL, dev, snapc, snapid,
			     obj, ofs, len, NULL,
			     pages, num_pages,
			     flags,
			     ops,
			     2,
			     NULL, 0,
			     NULL,
			     linger_req, ver);
	if (ret < 0)
		goto done_ops;

	/* on read, ret is the number of bytes to copy back out */
	if ((flags & CEPH_OSD_FLAG_READ) && buf)
		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);

done_ops:
	if (!orig_ops)
		rbd_destroy_ops(ops);
done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1053
/*
 * Do an asynchronous ceph osd operation
 *
 * Maps the image-relative range [@ofs, @ofs+@len) onto a single object
 * segment (name/offset/length via rbd_get_segment()) and dispatches the
 * request with rbd_req_cb as its completion callback.  @coll/@coll_index
 * tie this segment into the collection tracking the originating block
 * request.
 *
 * Returns 0 on successful submission or a negative errno.
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev ,
		     struct ceph_snap_context *snapc,
		     u64 snapid,
		     int opcode, int flags, int num_reply,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;

	/* GFP_NOIO: we are on the block I/O path, must not recurse into I/O */
	seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
	if (!seg_name)
		return -ENOMEM;

	seg_len = rbd_get_segment(&rbd_dev->header,
				  rbd_dev->header.block_name,
				  ofs, len,
				  seg_name, &seg_ofs);

	/* only writes carry a data payload */
	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);

	ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
	if (ret < 0)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	BUG_ON(seg_len < len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     num_reply,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1109
1110/*
1111 * Request async osd write
1112 */
1113static int rbd_req_write(struct request *rq,
1114 struct rbd_device *rbd_dev,
1115 struct ceph_snap_context *snapc,
1116 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001117 struct bio *bio,
1118 struct rbd_req_coll *coll,
1119 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001120{
1121 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1122 CEPH_OSD_OP_WRITE,
1123 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1124 2,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001125 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001126}
1127
1128/*
1129 * Request async osd read
1130 */
1131static int rbd_req_read(struct request *rq,
1132 struct rbd_device *rbd_dev,
1133 u64 snapid,
1134 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001135 struct bio *bio,
1136 struct rbd_req_coll *coll,
1137 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001138{
1139 return rbd_do_op(rq, rbd_dev, NULL,
1140 (snapid ? snapid : CEPH_NOSNAP),
1141 CEPH_OSD_OP_READ,
1142 CEPH_OSD_FLAG_READ,
1143 2,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001144 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001145}
1146
1147/*
1148 * Request sync osd read
1149 */
1150static int rbd_req_sync_read(struct rbd_device *dev,
1151 struct ceph_snap_context *snapc,
1152 u64 snapid,
1153 const char *obj,
1154 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001155 char *buf,
1156 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001157{
1158 return rbd_req_sync_op(dev, NULL,
1159 (snapid ? snapid : CEPH_NOSNAP),
1160 CEPH_OSD_OP_READ,
1161 CEPH_OSD_FLAG_READ,
1162 NULL,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001163 1, obj, ofs, len, buf, NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001164}
1165
1166/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001167 * Request sync osd watch
1168 */
1169static int rbd_req_sync_notify_ack(struct rbd_device *dev,
1170 u64 ver,
1171 u64 notify_id,
1172 const char *obj)
1173{
1174 struct ceph_osd_req_op *ops;
1175 struct page **pages = NULL;
Sage Weil11f77002011-05-12 16:13:54 -07001176 int ret;
1177
1178 ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001179 if (ret < 0)
1180 return ret;
1181
1182 ops[0].watch.ver = cpu_to_le64(dev->header.obj_version);
1183 ops[0].watch.cookie = notify_id;
1184 ops[0].watch.flag = 0;
1185
1186 ret = rbd_do_request(NULL, dev, NULL, CEPH_NOSNAP,
1187 obj, 0, 0, NULL,
1188 pages, 0,
1189 CEPH_OSD_FLAG_READ,
1190 ops,
1191 1,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001192 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001193 rbd_simple_req_cb, 0, NULL);
1194
1195 rbd_destroy_ops(ops);
1196 return ret;
1197}
1198
/*
 * Watch event callback: the header object changed (e.g. snapshot
 * created), so re-read the header under ctl_mutex and then ack the
 * notification so the OSD stops re-sending it.
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *dev = (struct rbd_device *)data;
	int rc;

	if (!dev)
		return;

	dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
		notify_id, (int)opcode);
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	rc = __rbd_update_snaps(dev);
	mutex_unlock(&ctl_mutex);
	if (rc)
		pr_warning(DRV_NAME "%d got notification but failed to update"
			   " snaps: %d\n", dev->major, rc);

	/* always ack, even if the snap update failed */
	rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name);
}
1218
/*
 * Request sync osd watch
 *
 * Creates a (non-one-shot) osd event delivering to rbd_watch_cb, then
 * registers a lingering WATCH op (flag = 1) on @obj.  On success the
 * event and lingering request are stored in dev->watch_event /
 * dev->watch_request for later teardown by rbd_req_sync_unwatch().
 */
static int rbd_req_sync_watch(struct rbd_device *dev,
			      const char *obj,
			      u64 ver)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc;

	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
	if (ret < 0)
		return ret;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)dev, &dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(ver);
	ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 = register the watch */

	ret = rbd_req_sync_op(dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      1, obj, 0, 0, NULL,
			      &dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(dev->watch_event);
	dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1263
/*
 * Request sync osd unwatch
 *
 * Mirror of rbd_req_sync_watch(): sends a WATCH op with flag = 0 to
 * deregister, then cancels the event created at watch time.  Assumes
 * dev->watch_event is non-NULL (i.e. a watch was established).
 */
static int rbd_req_sync_unwatch(struct rbd_device *dev,
				const char *obj)
{
	struct ceph_osd_req_op *ops;

	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
	if (ret < 0)
		return ret;

	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
	ops[0].watch.flag = 0;	/* 0 = remove the watch */

	ret = rbd_req_sync_op(dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      1, obj, 0, 0, NULL, NULL, NULL);

	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(dev->watch_event);
	dev->watch_event = NULL;
	return ret;
}
1292
/* context handed to rbd_notify_cb via ceph_osdc_create_event() */
struct rbd_notify_info {
	struct rbd_device *dev;
};
1296
/*
 * Callback for the one-shot event used by rbd_req_sync_notify();
 * only logs the notification, no state is updated here.
 */
static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *dev = (struct rbd_device *)data;
	if (!dev)
		return;

	dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
		notify_id, (int)opcode);
}
1306
/*
 * Request sync osd notify
 *
 * Sends a NOTIFY op on @obj (so other watchers re-read the header) and
 * waits up to CEPH_OSD_TIMEOUT_DEFAULT for the notification round-trip.
 * The event is created one-shot (third argument 1 to
 * ceph_osdc_create_event).
 *
 * NOTE(review): .ver/.timeout are assigned raw here while other sites
 * use cpu_to_le64 for watch fields - verify the expected wire format.
 */
static int rbd_req_sync_notify(struct rbd_device *dev,
		               const char *obj)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc;
	struct ceph_osd_event *event;
	struct rbd_notify_info info;
	/* payload: protocol version + timeout, two 32-bit values */
	int payload_len = sizeof(u32) + sizeof(u32);
	int ret;

	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
	if (ret < 0)
		return ret;

	info.dev = dev;

	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
				     (void *)&info, &event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = 1;
	ops[0].watch.flag = 1;
	ops[0].watch.cookie = event->cookie;
	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
	ops[0].watch.timeout = 12;

	ret = rbd_req_sync_op(dev, NULL,
			       CEPH_NOSNAP,
			       0,
			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			       ops,
			       1, obj, 0, 0, NULL, NULL, NULL);
	if (ret < 0)
		goto fail_event;

	ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
	dout("ceph_osdc_wait_event returned %d\n", ret);
	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(event);
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1357
/*
 * Request sync osd class-method call (CEPH_OSD_OP_CALL): executes
 * @cls.@method on the OSD hosting @obj with @data as input payload.
 * (The previous comment said "read", which this is not.)
 */
static int rbd_req_sync_exec(struct rbd_device *dev,
			     const char *obj,
			     const char *cls,
			     const char *method,
			     const char *data,
			     int len,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int cls_len = strlen(cls);
	int method_len = strlen(method);
	/* payload carries class name + method name + input data */
	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
				    cls_len + method_len + len);
	if (ret < 0)
		return ret;

	ops[0].cls.class_name = cls;
	ops[0].cls.class_len = (__u8)cls_len;
	ops[0].cls.method_name = method;
	ops[0].cls.method_len = (__u8)method_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = data;
	ops[0].cls.indata_len = len;

	ret = rbd_req_sync_op(dev, NULL,
			       CEPH_NOSNAP,
			       0,
			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			       ops,
			       1, obj, 0, 0, NULL, NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1397
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001398static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1399{
1400 struct rbd_req_coll *coll =
1401 kzalloc(sizeof(struct rbd_req_coll) +
1402 sizeof(struct rbd_req_status) * num_reqs,
1403 GFP_ATOMIC);
1404
1405 if (!coll)
1406 return NULL;
1407 coll->total = num_reqs;
1408 kref_init(&coll->kref);
1409 return coll;
1410}
1411
/*
 * block device queue callback
 *
 * Drains the request queue: each FS request is split along object
 * segment boundaries, its bio chain cloned per segment, and each
 * segment dispatched as an async OSD read/write.  A ref-counted
 * rbd_req_coll collects per-segment completions so the block request
 * is only ended once all segments finish.  The queue lock is dropped
 * while submitting (rbd_do_request may sleep) and retaken before
 * fetching the next request.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;
	struct bio_pair *bp = NULL;

	rq = blk_fetch_request(q);

	while (1) {
		struct bio *bio;
		struct bio *rq_bio, *next_bio = NULL;
		bool do_write;
		int size, op_size = 0;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;

		/* peek at request from block layer */
		if (!rq)
			break;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			goto next;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * 512ULL;
		rq_bio = rq->bio;
		if (do_write && rbd_dev->read_only) {
			__blk_end_request_all(rq, -EROFS);
			goto next;
		}

		/* submission may sleep; drop the queue lock meanwhile */
		spin_unlock_irq(q->queue_lock);

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, blk_rq_pos(rq) * 512ULL);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			goto next;
		}

		do {
			/* a bio clone to be passed down to OSD req */
			dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
			/* NULL name/ofs: we only want this segment's length */
			op_size = rbd_get_segment(&rbd_dev->header,
						  rbd_dev->header.block_name,
						  ofs, size,
						  NULL, NULL);
			/* one collection ref per in-flight segment */
			kref_get(&coll->kref);
			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
					      op_size, GFP_ATOMIC);
			if (!bio) {
				/* record failure but keep going: remaining
				   segments must still be accounted for */
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, op_size);
				goto next_seg;
			}


			/* init OSD command: write or read */
			if (do_write)
				rbd_req_write(rq, rbd_dev,
					      rbd_dev->header.snapc,
					      ofs,
					      op_size, bio,
					      coll, cur_seg);
			else
				rbd_req_read(rq, rbd_dev,
					     cur_snap_id(rbd_dev),
					     ofs,
					     op_size, bio,
					     coll, cur_seg);

next_seg:
			size -= op_size;
			ofs += op_size;

			cur_seg++;
			rq_bio = next_bio;
		} while (size > 0);
		/* drop the submission ref taken by rbd_alloc_coll */
		kref_put(&coll->kref, rbd_coll_release);

		if (bp)
			bio_pair_release(bp);
		spin_lock_irq(q->queue_lock);
next:
		rq = blk_fetch_request(q);
	}
}
1516
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	/* object size in 512-byte sectors (obj_order is log2 bytes) */
	unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9);
	sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
	unsigned int bio_sectors = bmd->bi_size >> 9;
	int max;

	/* bytes remaining in the object after the proposed bio */
	max = (chunk_sectors - ((sector & (chunk_sectors - 1))
				 + bio_sectors)) << 9;
	if (max < 0)
		max = 0; /* bio_add cannot handle a negative return */
	/* single-page exception: allow the page even if it crosses */
	if (max <= bvec->bv_len && bio_sectors == 0)
		return bvec->bv_len;
	return max;
}
1539
1540static void rbd_free_disk(struct rbd_device *rbd_dev)
1541{
1542 struct gendisk *disk = rbd_dev->disk;
1543
1544 if (!disk)
1545 return;
1546
1547 rbd_header_free(&rbd_dev->header);
1548
1549 if (disk->flags & GENHD_FL_UP)
1550 del_gendisk(disk);
1551 if (disk->queue)
1552 blk_cleanup_queue(disk->queue);
1553 put_disk(disk);
1554}
1555
1556/*
1557 * reload the ondisk the header
1558 */
1559static int rbd_read_header(struct rbd_device *rbd_dev,
1560 struct rbd_image_header *header)
1561{
1562 ssize_t rc;
1563 struct rbd_image_header_ondisk *dh;
1564 int snap_count = 0;
1565 u64 snap_names_len = 0;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001566 u64 ver;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001567
1568 while (1) {
1569 int len = sizeof(*dh) +
1570 snap_count * sizeof(struct rbd_image_snap_ondisk) +
1571 snap_names_len;
1572
1573 rc = -ENOMEM;
1574 dh = kmalloc(len, GFP_KERNEL);
1575 if (!dh)
1576 return -ENOMEM;
1577
1578 rc = rbd_req_sync_read(rbd_dev,
1579 NULL, CEPH_NOSNAP,
1580 rbd_dev->obj_md_name,
1581 0, len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001582 (char *)dh, &ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001583 if (rc < 0)
1584 goto out_dh;
1585
1586 rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
Josh Durgin81e759f2011-11-15 14:49:53 -08001587 if (rc < 0) {
1588 if (rc == -ENXIO) {
1589 pr_warning("unrecognized header format"
1590 " for image %s", rbd_dev->obj);
1591 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001592 goto out_dh;
Josh Durgin81e759f2011-11-15 14:49:53 -08001593 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001594
1595 if (snap_count != header->total_snaps) {
1596 snap_count = header->total_snaps;
1597 snap_names_len = header->snap_names_len;
1598 rbd_header_free(header);
1599 kfree(dh);
1600 continue;
1601 }
1602 break;
1603 }
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001604 header->obj_version = ver;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001605
1606out_dh:
1607 kfree(dh);
1608 return rc;
1609}
1610
1611/*
1612 * create a snapshot
1613 */
1614static int rbd_header_add_snap(struct rbd_device *dev,
1615 const char *snap_name,
1616 gfp_t gfp_flags)
1617{
1618 int name_len = strlen(snap_name);
1619 u64 new_snapid;
1620 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001621 void *data, *p, *e;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001622 u64 ver;
Alex Elder1dbb4392012-01-24 10:08:37 -06001623 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001624
1625 /* we should create a snapshot only if we're pointing at the head */
1626 if (dev->cur_snap)
1627 return -EINVAL;
1628
Alex Elder1dbb4392012-01-24 10:08:37 -06001629 monc = &dev->rbd_client->client->monc;
1630 ret = ceph_monc_create_snapid(monc, dev->poolid, &new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001631 dout("created snapid=%lld\n", new_snapid);
1632 if (ret < 0)
1633 return ret;
1634
1635 data = kmalloc(name_len + 16, gfp_flags);
1636 if (!data)
1637 return -ENOMEM;
1638
Sage Weil916d4d62011-05-12 16:10:50 -07001639 p = data;
1640 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001641
Sage Weil916d4d62011-05-12 16:10:50 -07001642 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1643 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001644
1645 ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
Sage Weil916d4d62011-05-12 16:10:50 -07001646 data, p - data, &ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001647
Sage Weil916d4d62011-05-12 16:10:50 -07001648 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001649
1650 if (ret < 0)
1651 return ret;
1652
1653 dev->header.snapc->seq = new_snapid;
1654
1655 return 0;
1656bad:
1657 return -ERANGE;
1658}
1659
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001660static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1661{
1662 struct rbd_snap *snap;
1663
1664 while (!list_empty(&rbd_dev->snaps)) {
1665 snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node);
1666 __rbd_remove_snap_dev(rbd_dev, snap);
1667 }
1668}
1669
/*
 * only read the first part of the ondisk header, without the snaps info
 *
 * Re-reads the header, updates the disk capacity, swaps in the new snap
 * context/names/sizes under snap_rwsem, and rebuilds the sysfs snapshot
 * devices.  If we were tracking the newest snapshot ("head"), keep
 * following it in the new context; otherwise keep the old seq.
 */
static int __rbd_update_snaps(struct rbd_device *rbd_dev)
{
	int ret;
	struct rbd_image_header h;
	u64 snap_seq;
	int follow_seq = 0;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	/* resized? */
	set_capacity(rbd_dev->disk, h.image_size / 512ULL);

	down_write(&rbd_dev->header.snap_rwsem);

	snap_seq = rbd_dev->header.snapc->seq;
	if (rbd_dev->header.total_snaps &&
	    rbd_dev->header.snapc->snaps[0] == snap_seq)
		/* pointing at the head, will need to follow that
		   if head moves */
		follow_seq = 1;

	/* free the old header fields before adopting the new ones */
	kfree(rbd_dev->header.snapc);
	kfree(rbd_dev->header.snap_names);
	kfree(rbd_dev->header.snap_sizes);

	rbd_dev->header.total_snaps = h.total_snaps;
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_names_len = h.snap_names_len;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	if (follow_seq)
		rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0];
	else
		rbd_dev->header.snapc->seq = snap_seq;

	ret = __rbd_init_snaps_header(rbd_dev);

	up_write(&rbd_dev->header.snap_rwsem);

	return ret;
}
1716
/*
 * Set up the gendisk and request queue for a newly mapped image:
 * read the header, build the snapshot list, resolve the mapped
 * snapshot (fills total_size), then allocate/configure the disk and
 * announce it.  Returns 0 on success or a negative errno.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	int rc;
	u64 total_size = 0;

	/* contact OSD, request size info about the object being mapped */
	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (rc)
		return rc;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = __rbd_init_snaps_header(rbd_dev);
	if (rc)
		return rc;

	rc = rbd_header_set_snap(rbd_dev, &total_size);
	if (rc)
		return rc;

	/* create gendisk info */
	rc = -ENOMEM;
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		goto out;

	snprintf(disk->disk_name, sizeof(disk->disk_name), DRV_NAME "%d",
		 rbd_dev->id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	rc = -ENOMEM;
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* set io sizes to object size */
	blk_queue_max_hw_sectors(q, rbd_obj_bytes(&rbd_dev->header) / 512ULL);
	blk_queue_max_segment_size(q, rbd_obj_bytes(&rbd_dev->header));
	blk_queue_io_min(q, rbd_obj_bytes(&rbd_dev->header));
	blk_queue_io_opt(q, rbd_obj_bytes(&rbd_dev->header));

	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;
	rbd_dev->q = q;

	/* finally, announce the disk to the world */
	set_capacity(disk, total_size / 512ULL);
	add_disk(disk);

	pr_info("%s: added with size 0x%llx\n",
		disk->disk_name, (unsigned long long)total_size);
	return 0;

out_disk:
	put_disk(disk);
out:
	return rc;
}
1784
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001785/*
1786 sysfs
1787*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001788
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001789static ssize_t rbd_size_show(struct device *dev,
1790 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001791{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001792 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1793
1794 return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001795}
1796
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001797static ssize_t rbd_major_show(struct device *dev,
1798 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001799{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001800 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1801
1802 return sprintf(buf, "%d\n", rbd_dev->major);
1803}
1804
1805static ssize_t rbd_client_id_show(struct device *dev,
1806 struct device_attribute *attr, char *buf)
1807{
1808 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1809
Alex Elder1dbb4392012-01-24 10:08:37 -06001810 return sprintf(buf, "client%lld\n",
1811 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001812}
1813
1814static ssize_t rbd_pool_show(struct device *dev,
1815 struct device_attribute *attr, char *buf)
1816{
1817 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1818
1819 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1820}
1821
1822static ssize_t rbd_name_show(struct device *dev,
1823 struct device_attribute *attr, char *buf)
1824{
1825 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1826
1827 return sprintf(buf, "%s\n", rbd_dev->obj);
1828}
1829
1830static ssize_t rbd_snap_show(struct device *dev,
1831 struct device_attribute *attr,
1832 char *buf)
1833{
1834 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1835
1836 return sprintf(buf, "%s\n", rbd_dev->snap_name);
1837}
1838
1839static ssize_t rbd_image_refresh(struct device *dev,
1840 struct device_attribute *attr,
1841 const char *buf,
1842 size_t size)
1843{
1844 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1845 int rc;
1846 int ret = size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001847
1848 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1849
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001850 rc = __rbd_update_snaps(rbd_dev);
1851 if (rc < 0)
1852 ret = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001853
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001854 mutex_unlock(&ctl_mutex);
1855 return ret;
1856}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001857
/* per-device sysfs attributes; see Documentation/ABI/testing/sysfs-bus-rbd */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001866
/* attribute table wired into rbd_device_type below */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_name.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	&dev_attr_create_snap.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* empty release: rbd_device lifetime is managed elsewhere */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
1897
1898
1899/*
1900 sysfs - snapshots
1901*/
1902
1903static ssize_t rbd_snap_size_show(struct device *dev,
1904 struct device_attribute *attr,
1905 char *buf)
1906{
1907 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1908
1909 return sprintf(buf, "%lld\n", (long long)snap->size);
1910}
1911
1912static ssize_t rbd_snap_id_show(struct device *dev,
1913 struct device_attribute *attr,
1914 char *buf)
1915{
1916 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1917
1918 return sprintf(buf, "%lld\n", (long long)snap->id);
1919}
1920
/* per-snapshot sysfs attributes */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* frees the rbd_snap when the embedded device's last ref is dropped */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
1950
/*
 * Unlink @snap from the device's snap list and unregister its device;
 * the final device reference triggers rbd_snap_dev_release(), which
 * frees @snap.  NOTE(review): callers appear to hold ctl_mutex - confirm.
 */
static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
				  struct rbd_snap *snap)
{
	list_del(&snap->node);
	device_unregister(&snap->dev);
}
1957
1958static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
1959 struct rbd_snap *snap,
1960 struct device *parent)
1961{
1962 struct device *dev = &snap->dev;
1963 int ret;
1964
1965 dev->type = &rbd_snap_device_type;
1966 dev->parent = parent;
1967 dev->release = rbd_snap_dev_release;
1968 dev_set_name(dev, "snap_%s", snap->name);
1969 ret = device_register(dev);
1970
1971 return ret;
1972}
1973
1974static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
1975 int i, const char *name,
1976 struct rbd_snap **snapp)
1977{
1978 int ret;
1979 struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
1980 if (!snap)
1981 return -ENOMEM;
1982 snap->name = kstrdup(name, GFP_KERNEL);
1983 snap->size = rbd_dev->header.snap_sizes[i];
1984 snap->id = rbd_dev->header.snapc->snaps[i];
1985 if (device_is_registered(&rbd_dev->dev)) {
1986 ret = rbd_register_snap_dev(rbd_dev, snap,
1987 &rbd_dev->dev);
1988 if (ret < 0)
1989 goto err;
1990 }
1991 *snapp = snap;
1992 return 0;
1993err:
1994 kfree(snap->name);
1995 kfree(snap);
1996 return ret;
1997}
1998
/*
 * search for the previous snap in a null delimited string list
 *
 * @name points at the start of one name inside the buffer beginning at
 * @start.  Returns a pointer to the preceding name, or NULL when @name
 * is already the first entry.
 */
const char *rbd_prev_snap_name(const char *name, const char *start)
{
	const char *p;

	/* the first name has no predecessor */
	if (name - start < 2)
		return NULL;

	/* step over the terminator of the previous name, then walk back
	   to the byte just after the terminator before it */
	for (p = name - 2; *p; p--)
		if (p == start)
			return start;
	return p + 1;
}
2015
/*
 * compare the old list of snapshots that we have to what's in the header
 * and update it accordingly. Note that the header holds the snapshots
 * in a reverse order (from newest to oldest) and we need to go from
 * older to new so that we don't get a duplicate snap name when
 * doing the process (e.g., removed snapshot and recreated a new
 * one with the same name.
 */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
{
	const char *name, *first_name;
	/* i counts down through header snaps; index i-1 is the current one */
	int i = rbd_dev->header.total_snaps;
	struct rbd_snap *snap, *old_snap = NULL;
	int ret;
	struct list_head *p, *n;

	first_name = rbd_dev->header.snap_names;
	/* start one past the end; rbd_prev_snap_name walks backwards */
	name = first_name + rbd_dev->header.snap_names_len;

	list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
		u64 cur_id;

		old_snap = list_entry(p, struct rbd_snap, node);

		if (i)
			cur_id = rbd_dev->header.snapc->snaps[i - 1];

		if (!i || old_snap->id < cur_id) {
			/* old_snap->id was skipped, thus was removed */
			__rbd_remove_snap_dev(rbd_dev, old_snap);
			continue;
		}
		if (old_snap->id == cur_id) {
			/* we have this snapshot already */
			i--;
			name = rbd_prev_snap_name(name, first_name);
			continue;
		}
		for (; i > 0;
		     i--, name = rbd_prev_snap_name(name, first_name)) {
			if (!name) {
				WARN_ON(1);
				return -EINVAL;
			}
			/* NOTE(review): other sites index snaps[i - 1] for
			 * slot i; verify snaps[i] here is intentional */
			cur_id = rbd_dev->header.snapc->snaps[i];
			/* snapshot removal? handle it above */
			if (cur_id >= old_snap->id)
				break;
			/* a new snapshot */
			ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
			if (ret < 0)
				return ret;

			/* note that we add it backward so using n and not p */
			list_add(&snap->node, n);
			p = &snap->node;
		}
	}
	/* we're done going over the old snap list, just add what's left */
	for (; i > 0; i--) {
		name = rbd_prev_snap_name(name, first_name);
		if (!name) {
			WARN_ON(1);
			return -EINVAL;
		}
		ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
		if (ret < 0)
			return ret;
		list_add(&snap->node, &rbd_dev->snaps);
	}

	return 0;
}
2089
2090
2091static void rbd_root_dev_release(struct device *dev)
2092{
2093}
2094
2095static struct device rbd_root_dev = {
2096 .init_name = "rbd",
2097 .release = rbd_root_dev_release,
2098};
2099
2100static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2101{
2102 int ret = -ENOMEM;
2103 struct device *dev;
2104 struct rbd_snap *snap;
2105
2106 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2107 dev = &rbd_dev->dev;
2108
2109 dev->bus = &rbd_bus_type;
2110 dev->type = &rbd_device_type;
2111 dev->parent = &rbd_root_dev;
2112 dev->release = rbd_dev_release;
2113 dev_set_name(dev, "%d", rbd_dev->id);
2114 ret = device_register(dev);
2115 if (ret < 0)
2116 goto done_free;
2117
2118 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2119 ret = rbd_register_snap_dev(rbd_dev, snap,
2120 &rbd_dev->dev);
2121 if (ret < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002122 break;
2123 }
2124
2125 mutex_unlock(&ctl_mutex);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002126 return 0;
2127done_free:
2128 mutex_unlock(&ctl_mutex);
2129 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002130}
2131
/*
 * Remove an rbd device from sysfs.  The final reference drop invokes
 * the ->release hook set in rbd_bus_add_dev() (rbd_dev_release()),
 * which frees the rbd_device itself.
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2136
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002137static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2138{
2139 int ret, rc;
2140
2141 do {
2142 ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name,
2143 rbd_dev->header.obj_version);
2144 if (ret == -ERANGE) {
2145 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2146 rc = __rbd_update_snaps(rbd_dev);
2147 mutex_unlock(&ctl_mutex);
2148 if (rc < 0)
2149 return rc;
2150 }
2151 } while (ret == -ERANGE);
2152
2153 return ret;
2154}
2155
/* Highest rbd id handed out so far; 0 means none allocated (ids start at 1). */
static atomic64_t rbd_id_max = ATOMIC64_INIT(0);
2157
/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_id_get(struct rbd_device *rbd_dev)
{
	/* atomic64_inc_return() never hands out the same id twice */
	rbd_dev->id = atomic64_inc_return(&rbd_id_max);

	/* rbd_dev_list is shared with rbd_id_put() and __rbd_get_dev() */
	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002170
Alex Elder1ddbe942012-01-29 13:57:44 -06002171/*
Alex Elder499afd52012-02-02 08:13:29 -06002172 * Remove an rbd_dev from the global list, and record that its
2173 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002174 */
Alex Elder499afd52012-02-02 08:13:29 -06002175static void rbd_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002176{
Alex Elderd184f6b2012-01-29 13:57:44 -06002177 struct list_head *tmp;
2178 int rbd_id = rbd_dev->id;
2179 int max_id;
2180
2181 BUG_ON(rbd_id < 1);
Alex Elder499afd52012-02-02 08:13:29 -06002182
2183 spin_lock(&rbd_dev_list_lock);
2184 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002185
2186 /*
2187 * If the id being "put" is not the current maximum, there
2188 * is nothing special we need to do.
2189 */
2190 if (rbd_id != atomic64_read(&rbd_id_max)) {
2191 spin_unlock(&rbd_dev_list_lock);
2192 return;
2193 }
2194
2195 /*
2196 * We need to update the current maximum id. Search the
2197 * list to find out what it is. We're more likely to find
2198 * the maximum at the end, so search the list backward.
2199 */
2200 max_id = 0;
2201 list_for_each_prev(tmp, &rbd_dev_list) {
2202 struct rbd_device *rbd_dev;
2203
2204 rbd_dev = list_entry(tmp, struct rbd_device, node);
2205 if (rbd_id > max_id)
2206 max_id = rbd_id;
2207 }
Alex Elder499afd52012-02-02 08:13:29 -06002208 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002209
Alex Elder1ddbe942012-01-29 13:57:44 -06002210 /*
Alex Elderd184f6b2012-01-29 13:57:44 -06002211 * The max id could have been updated by rbd_id_get(), in
2212 * which case it now accurately reflects the new maximum.
2213 * Be careful not to overwrite the maximum value in that
2214 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002215 */
Alex Elderd184f6b2012-01-29 13:57:44 -06002216 atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
Alex Elderb7f23c32012-01-29 13:57:43 -06002217}
2218
/*
 * sysfs "add" store method (/sys/bus/rbd/add).  Parses
 * "<mon_dev_name> <options> <pool> <image> [<snap>]" from @buf,
 * allocates an rbd_device, connects it to the cluster, and registers
 * the block device plus its sysfs representation.
 *
 * Returns @count on success, a negative errno on failure.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct ceph_osd_client *osdc;
	struct rbd_device *rbd_dev;
	ssize_t rc = -ENOMEM;	/* covers the three allocation failures below */
	int irc;
	char *mon_dev_name;
	char *options;

	/* hold a module reference for the device's lifetime; it is
	 * dropped in rbd_dev_release() (or at err_out_mod on error) */
	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
	if (!mon_dev_name)
		goto err_out_mod;

	options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
	if (!options)
		goto err_mon_dev;

	/* new rbd_device object */
	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		goto err_out_opt;

	/* static rbd_device initialization */
	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);

	init_rwsem(&rbd_dev->header.snap_rwsem);

	/* generate unique id: find highest unique id, add one */
	rbd_id_get(rbd_dev);

	/* parse add command; the snapshot name is optional (>= 4 fields) */
	if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s "
		   "%" __stringify(RBD_MAX_OPT_LEN) "s "
		   "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s "
		   "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s"
		   "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
		   mon_dev_name, options, rbd_dev->pool_name,
		   rbd_dev->obj, rbd_dev->snap_name) < 4) {
		rc = -EINVAL;
		goto err_out_slot;
	}

	/* no snapshot supplied: map the image head */
	if (rbd_dev->snap_name[0] == 0)
		memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
		       sizeof (RBD_SNAP_HEAD_NAME));

	rbd_dev->obj_len = strlen(rbd_dev->obj);
	snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s",
		 rbd_dev->obj, RBD_SUFFIX);

	/* initialize rest of new object */
	snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	rc = rbd_get_client(rbd_dev, mon_dev_name, options);
	mutex_unlock(&ctl_mutex);

	if (rc < 0)
		goto err_out_slot;

	/* pick the pool */
	osdc = &rbd_dev->rbd_client->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
	if (rc < 0)
		goto err_out_client;
	rbd_dev->poolid = rc;

	/* register our block device (0 requests a dynamic major) */
	irc = register_blkdev(0, rbd_dev->name);
	if (irc < 0) {
		rc = irc;
		goto err_out_client;
	}
	rbd_dev->major = irc;

	rc = rbd_bus_add_dev(rbd_dev);
	if (rc)
		goto err_out_blkdev;

	/* set up and announce blkdev mapping */
	rc = rbd_init_disk(rbd_dev);
	if (rc)
		goto err_out_bus;

	rc = rbd_init_watch_dev(rbd_dev);
	if (rc)
		goto err_out_bus;

	return count;

err_out_bus:
	rbd_id_put(rbd_dev);

	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);
	kfree(options);
	kfree(mon_dev_name);
	return rc;

err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_client:
	rbd_put_client(rbd_dev);
err_out_slot:
	rbd_id_put(rbd_dev);

	kfree(rbd_dev);
err_out_opt:
	kfree(options);
err_mon_dev:
	kfree(mon_dev_name);
err_out_mod:
	dout("Error adding device %s\n", buf);
	module_put(THIS_MODULE);
	return rc;
}
2343
2344static struct rbd_device *__rbd_get_dev(unsigned long id)
2345{
2346 struct list_head *tmp;
2347 struct rbd_device *rbd_dev;
2348
Alex Eldere124a822012-01-29 13:57:44 -06002349 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002350 list_for_each(tmp, &rbd_dev_list) {
2351 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Eldere124a822012-01-29 13:57:44 -06002352 if (rbd_dev->id == id) {
2353 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002354 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06002355 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002356 }
Alex Eldere124a822012-01-29 13:57:44 -06002357 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002358 return NULL;
2359}
2360
/*
 * Release callback for an rbd device (installed as dev->release in
 * rbd_bus_add_dev()); runs once the last reference to the embedded
 * struct device is dropped, e.g. after rbd_bus_del_dev().
 *
 * Tears down in reverse order of setup: watch state, ceph client,
 * disk, block-device major, the rbd_device memory itself, and finally
 * the module reference taken in rbd_add().
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev =
			container_of(dev, struct rbd_device, dev);

	/* cancel the lingering watch request, if one was set up */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev, rbd_dev->obj_md_name);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
2385
/*
 * sysfs "remove" store method (/sys/bus/rbd/remove).  @buf holds the
 * decimal id (as assigned in rbd_add()) of the device to unmap.
 *
 * Returns @count on success, -ENOENT if no such device exists,
 * -EINVAL if the id does not fit in an int, or the parse error from
 * strict_strtoul().
 */
static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	int ret = count;

	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	/* release the id and take the device off the global list first */
	rbd_id_put(rbd_dev);

	/* rbd_bus_del_dev() ends up in rbd_dev_release(), which frees
	 * rbd_dev -- do not touch it afterwards */
	__rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);

done:
	mutex_unlock(&ctl_mutex);
	return ret;
}
2421
/*
 * Per-device sysfs store method: create a snapshot named after the
 * written string, refresh the in-memory snapshot list, and notify
 * watchers of the header change.
 *
 * Returns @count on success, a negative errno otherwise.
 */
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count)
{
	struct rbd_device *rbd_dev = dev_to_rbd(dev);
	int ret;
	char *name = kmalloc(count + 1, GFP_KERNEL);
	if (!name)
		return -ENOMEM;

	/* NOTE(review): a size of count keeps at most count - 1 bytes,
	 * silently dropping the final character of @buf.  That strips
	 * the usual trailing '\n' of a sysfs write, but also truncates
	 * names written without one (echo -n); confirm whether
	 * count + 1 was intended, given the buffer is count + 1 bytes. */
	snprintf(name, count, "%s", buf);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	ret = rbd_header_add_snap(rbd_dev,
				  name, GFP_KERNEL);
	if (ret < 0)
		goto err_unlock;

	/* refresh our snapshot state to pick up the new snapshot */
	ret = __rbd_update_snaps(rbd_dev);
	if (ret < 0)
		goto err_unlock;

	/* shouldn't hold ctl_mutex when notifying.. notify might
	   trigger a watch callback that would need to get that mutex */
	mutex_unlock(&ctl_mutex);

	/* make a best effort, don't error if failed */
	rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name);

	ret = count;
	kfree(name);
	return ret;

err_unlock:
	mutex_unlock(&ctl_mutex);
	kfree(name);
	return ret;
}
2462
/* Control files created under /sys/bus/rbd: "add" and "remove" (write-only). */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};
2468
2469/*
2470 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002471 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002472 */
2473static int rbd_sysfs_init(void)
2474{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002475 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002476
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002477 rbd_bus_type.bus_attrs = rbd_bus_attrs;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002478
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002479 ret = bus_register(&rbd_bus_type);
Alex Elder21079782012-01-24 10:08:36 -06002480 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002481 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002482
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002483 ret = device_register(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002484
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002485 return ret;
2486}
2487
/*
 * Remove the sysfs control interface: mirror image of rbd_sysfs_init()
 * (root device first, then the bus).
 */
static void rbd_sysfs_cleanup(void)
{
	device_unregister(&rbd_root_dev);
	bus_unregister(&rbd_bus_type);
}
2493
2494int __init rbd_init(void)
2495{
2496 int rc;
2497
2498 rc = rbd_sysfs_init();
2499 if (rc)
2500 return rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002501 pr_info("loaded " DRV_NAME_LONG "\n");
2502 return 0;
2503}
2504
/* Module exit point: tear down the sysfs interface built in rbd_init(). */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2509
/* Module registration and metadata. */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");