blob: 8ac26ab09aa0b99117052e917346853f672c955b [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

/* Size of the header object name buffer: image name plus RBD_SUFFIX. */
#define RBD_MAX_MD_NAME_LEN	(RBD_MAX_OBJ_NAME_LEN + sizeof(RBD_SUFFIX))
#define RBD_MAX_POOL_NAME_LEN	64
#define RBD_MAX_SNAP_NAME_LEN	32
#define RBD_MAX_OPT_LEN		1024

/* Snapshot name that selects the image head instead of a snapshot. */
#define RBD_SNAP_HEAD_NAME	"-"

#define DEV_NAME_LEN		32

/* Initial value of rbd_options.notify_timeout. */
#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
59
Yehuda Sadeh602adf42010-08-12 16:11:25 -070060/*
61 * block device image metadata (in-memory version)
62 */
63struct rbd_image_header {
64 u64 image_size;
65 char block_name[32];
66 __u8 obj_order;
67 __u8 crypt_type;
68 __u8 comp_type;
69 struct rw_semaphore snap_rwsem;
70 struct ceph_snap_context *snapc;
71 size_t snap_names_len;
72 u64 snap_seq;
73 u32 total_snaps;
74
75 char *snap_names;
76 u64 *snap_sizes;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070077
78 u64 obj_version;
79};
80
/* rbd-specific options parsed by parse_rbd_opts_token() */
struct rbd_options {
	int notify_timeout;
};
84
85/*
Alex Elderf0f8cef2012-01-29 13:57:44 -060086 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -070087 */
88struct rbd_client {
89 struct ceph_client *client;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070090 struct rbd_options *rbd_opts;
Yehuda Sadeh602adf42010-08-12 16:11:25 -070091 struct kref kref;
92 struct list_head node;
93};
94
95/*
Alex Elderf0f8cef2012-01-29 13:57:44 -060096 * a request completion status
Yehuda Sadeh602adf42010-08-12 16:11:25 -070097 */
Yehuda Sadeh1fec7092011-05-13 13:52:56 -070098struct rbd_req_status {
99 int done;
100 int rc;
101 u64 bytes;
102};
103
104/*
105 * a collection of requests
106 */
107struct rbd_req_coll {
108 int total;
109 int num_done;
110 struct kref kref;
111 struct rbd_req_status status[0];
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700112};
113
Alex Elderf0f8cef2012-01-29 13:57:44 -0600114/*
115 * a single io request
116 */
117struct rbd_request {
118 struct request *rq; /* blk layer request */
119 struct bio *bio; /* cloned bio */
120 struct page **pages; /* list of used pages */
121 u64 len;
122 int coll_index;
123 struct rbd_req_coll *coll;
124};
125
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800126struct rbd_snap {
127 struct device dev;
128 const char *name;
129 size_t size;
130 struct list_head node;
131 u64 id;
132};
133
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700134/*
135 * a single device
136 */
137struct rbd_device {
138 int id; /* blkdev unique id */
139
140 int major; /* blkdev assigned major */
141 struct gendisk *disk; /* blkdev's gendisk and rq */
142 struct request_queue *q;
143
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700144 struct rbd_client *rbd_client;
145
146 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
147
148 spinlock_t lock; /* queue lock */
149
150 struct rbd_image_header header;
151 char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
152 int obj_len;
153 char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
154 char pool_name[RBD_MAX_POOL_NAME_LEN];
155 int poolid;
156
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700157 struct ceph_osd_event *watch_event;
158 struct ceph_osd_request *watch_request;
159
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700160 char snap_name[RBD_MAX_SNAP_NAME_LEN];
161 u32 cur_snap; /* index+1 of current snapshot within snap context
162 0 - for the head */
163 int read_only;
164
165 struct list_head node;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800166
167 /* list of snapshots */
168 struct list_head snaps;
169
170 /* sysfs related */
171 struct device dev;
172};
173
/* Serialize open/close/setup/teardown */
static DEFINE_MUTEX(ctl_mutex);

/* All rbd devices */
static LIST_HEAD(rbd_dev_list);
static DEFINE_SPINLOCK(rbd_dev_list_lock);

/* All rbd clients */
static LIST_HEAD(rbd_client_list);
static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700181
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800182static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
183static void rbd_dev_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800184static ssize_t rbd_snap_add(struct device *dev,
185 struct device_attribute *attr,
186 const char *buf,
187 size_t count);
188static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
Justin P. Mattock69932482011-07-26 23:06:29 -0700189 struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800190
Alex Elderf0f8cef2012-01-29 13:57:44 -0600191static ssize_t rbd_add(struct bus_type *bus, const char *buf,
192 size_t count);
193static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
194 size_t count);
195
196static struct bus_attribute rbd_bus_attrs[] = {
197 __ATTR(add, S_IWUSR, NULL, rbd_add),
198 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
199 __ATTR_NULL
200};
201
202static struct bus_type rbd_bus_type = {
203 .name = "rbd",
204 .bus_attrs = rbd_bus_attrs,
205};
206
/* Release callback for rbd_root_dev; intentionally does nothing. */
static void rbd_root_dev_release(struct device *dev)
{
}
210
211static struct device rbd_root_dev = {
212 .init_name = "rbd",
213 .release = rbd_root_dev_release,
214};
215
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800216
217static struct rbd_device *dev_to_rbd(struct device *dev)
218{
219 return container_of(dev, struct rbd_device, dev);
220}
221
222static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
223{
224 return get_device(&rbd_dev->dev);
225}
226
227static void rbd_put_dev(struct rbd_device *rbd_dev)
228{
229 put_device(&rbd_dev->dev);
230}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700231
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700232static int __rbd_update_snaps(struct rbd_device *rbd_dev);
233
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700234static int rbd_open(struct block_device *bdev, fmode_t mode)
235{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600236 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700237
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800238 rbd_get_dev(rbd_dev);
239
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700240 set_device_ro(bdev, rbd_dev->read_only);
241
242 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
243 return -EROFS;
244
245 return 0;
246}
247
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800248static int rbd_release(struct gendisk *disk, fmode_t mode)
249{
250 struct rbd_device *rbd_dev = disk->private_data;
251
252 rbd_put_dev(rbd_dev);
253
254 return 0;
255}
256
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700257static const struct block_device_operations rbd_bd_ops = {
258 .owner = THIS_MODULE,
259 .open = rbd_open,
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800260 .release = rbd_release,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700261};
262
263/*
264 * Initialize an rbd client instance.
265 * We own *opt.
266 */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700267static struct rbd_client *rbd_client_create(struct ceph_options *opt,
268 struct rbd_options *rbd_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700269{
270 struct rbd_client *rbdc;
271 int ret = -ENOMEM;
272
273 dout("rbd_client_create\n");
274 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
275 if (!rbdc)
276 goto out_opt;
277
278 kref_init(&rbdc->kref);
279 INIT_LIST_HEAD(&rbdc->node);
280
Alex Elderbc534d82012-01-29 13:57:44 -0600281 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
282
Sage Weil6ab00d42011-08-09 09:41:59 -0700283 rbdc->client = ceph_create_client(opt, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700284 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600285 goto out_mutex;
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400286 opt = NULL; /* Now rbdc->client is responsible for opt */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700287
288 ret = ceph_open_session(rbdc->client);
289 if (ret < 0)
290 goto out_err;
291
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700292 rbdc->rbd_opts = rbd_opts;
293
Alex Elder432b8582012-01-29 13:57:44 -0600294 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700295 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600296 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700297
Alex Elderbc534d82012-01-29 13:57:44 -0600298 mutex_unlock(&ctl_mutex);
299
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700300 dout("rbd_client_create created %p\n", rbdc);
301 return rbdc;
302
303out_err:
304 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600305out_mutex:
306 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700307 kfree(rbdc);
308out_opt:
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400309 if (opt)
310 ceph_destroy_options(opt);
311 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700312}
313
314/*
315 * Find a ceph client with specific addr and configuration.
316 */
317static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
318{
319 struct rbd_client *client_node;
320
321 if (opt->flags & CEPH_OPT_NOSHARE)
322 return NULL;
323
324 list_for_each_entry(client_node, &rbd_client_list, node)
325 if (ceph_compare_options(opt, client_node->client) == 0)
326 return client_node;
327 return NULL;
328}
329
330/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700331 * mount options
332 */
333enum {
334 Opt_notify_timeout,
335 Opt_last_int,
336 /* int args above */
337 Opt_last_string,
338 /* string args above */
339};
340
341static match_table_t rbdopt_tokens = {
342 {Opt_notify_timeout, "notify_timeout=%d"},
343 /* int args above */
344 /* string args above */
345 {-1, NULL}
346};
347
348static int parse_rbd_opts_token(char *c, void *private)
349{
350 struct rbd_options *rbdopt = private;
351 substring_t argstr[MAX_OPT_ARGS];
352 int token, intval, ret;
353
Alex Elder21079782012-01-24 10:08:36 -0600354 token = match_token(c, rbdopt_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700355 if (token < 0)
356 return -EINVAL;
357
358 if (token < Opt_last_int) {
359 ret = match_int(&argstr[0], &intval);
360 if (ret < 0) {
361 pr_err("bad mount option arg (not int) "
362 "at '%s'\n", c);
363 return ret;
364 }
365 dout("got int token %d val %d\n", token, intval);
366 } else if (token > Opt_last_int && token < Opt_last_string) {
367 dout("got string token %d val %s\n", token,
368 argstr[0].from);
369 } else {
370 dout("got token %d\n", token);
371 }
372
373 switch (token) {
374 case Opt_notify_timeout:
375 rbdopt->notify_timeout = intval;
376 break;
377 default:
378 BUG_ON(token);
379 }
380 return 0;
381}
382
383/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700384 * Get a ceph client with specific addr and configuration, if one does
385 * not exist create it.
386 */
Alex Elderd720bcb2012-02-02 08:13:30 -0600387static struct rbd_client *rbd_get_client(const char *mon_addr, char *options)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700388{
389 struct rbd_client *rbdc;
390 struct ceph_options *opt;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700391 struct rbd_options *rbd_opts;
392
393 rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
394 if (!rbd_opts)
Alex Elderd720bcb2012-02-02 08:13:30 -0600395 return ERR_PTR(-ENOMEM);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700396
397 rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700398
Alex Elderee577412012-01-24 10:08:36 -0600399 opt = ceph_parse_options(options, mon_addr,
Alex Elder21079782012-01-24 10:08:36 -0600400 mon_addr + strlen(mon_addr),
401 parse_rbd_opts_token, rbd_opts);
Alex Elderee577412012-01-24 10:08:36 -0600402 if (IS_ERR(opt)) {
Alex Elderd720bcb2012-02-02 08:13:30 -0600403 kfree(rbd_opts);
404 return ERR_CAST(opt);
Alex Elderee577412012-01-24 10:08:36 -0600405 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700406
Alex Elder432b8582012-01-29 13:57:44 -0600407 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700408 rbdc = __rbd_client_find(opt);
409 if (rbdc) {
Alex Eldere6994d32012-01-29 13:57:44 -0600410 /* using an existing client */
411 kref_get(&rbdc->kref);
Alex Elder432b8582012-01-29 13:57:44 -0600412 spin_unlock(&rbd_client_list_lock);
Alex Eldere6994d32012-01-29 13:57:44 -0600413
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700414 ceph_destroy_options(opt);
Alex Elder97bb59a2012-01-24 10:08:36 -0600415 kfree(rbd_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700416
Alex Elderd720bcb2012-02-02 08:13:30 -0600417 return rbdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700418 }
Alex Elder432b8582012-01-29 13:57:44 -0600419 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700420
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700421 rbdc = rbd_client_create(opt, rbd_opts);
Alex Elderd97081b2012-01-29 13:57:44 -0600422
Alex Elderd720bcb2012-02-02 08:13:30 -0600423 if (IS_ERR(rbdc))
424 kfree(rbd_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700425
Alex Elderd720bcb2012-02-02 08:13:30 -0600426 return rbdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700427}
428
429/*
430 * Destroy ceph client
Alex Elderd23a4b32012-01-29 13:57:43 -0600431 *
Alex Elder432b8582012-01-29 13:57:44 -0600432 * Caller must hold rbd_client_list_lock.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700433 */
434static void rbd_client_release(struct kref *kref)
435{
436 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
437
438 dout("rbd_release_client %p\n", rbdc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700439 list_del(&rbdc->node);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700440
441 ceph_destroy_client(rbdc->client);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700442 kfree(rbdc->rbd_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700443 kfree(rbdc);
444}
445
446/*
447 * Drop reference to ceph client node. If it's not referenced anymore, release
448 * it.
449 */
450static void rbd_put_client(struct rbd_device *rbd_dev)
451{
Alex Elder432b8582012-01-29 13:57:44 -0600452 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700453 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
Alex Elder432b8582012-01-29 13:57:44 -0600454 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700455 rbd_dev->rbd_client = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700456}
457
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700458/*
459 * Destroy requests collection
460 */
461static void rbd_coll_release(struct kref *kref)
462{
463 struct rbd_req_coll *coll =
464 container_of(kref, struct rbd_req_coll, kref);
465
466 dout("rbd_coll_release %p\n", coll);
467 kfree(coll);
468}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700469
470/*
471 * Create a new header structure, translate header format from the on-disk
472 * header.
473 */
474static int rbd_header_from_disk(struct rbd_image_header *header,
475 struct rbd_image_header_ondisk *ondisk,
476 int allocated_snaps,
477 gfp_t gfp_flags)
478{
479 int i;
480 u32 snap_count = le32_to_cpu(ondisk->snap_count);
481 int ret = -ENOMEM;
482
Alex Elder21079782012-01-24 10:08:36 -0600483 if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT)))
Josh Durgin81e759f2011-11-15 14:49:53 -0800484 return -ENXIO;
Josh Durgin81e759f2011-11-15 14:49:53 -0800485
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700486 init_rwsem(&header->snap_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700487 header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
488 header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
Alex Elder21079782012-01-24 10:08:36 -0600489 snap_count * sizeof (*ondisk),
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700490 gfp_flags);
491 if (!header->snapc)
492 return -ENOMEM;
493 if (snap_count) {
494 header->snap_names = kmalloc(header->snap_names_len,
495 GFP_KERNEL);
496 if (!header->snap_names)
497 goto err_snapc;
498 header->snap_sizes = kmalloc(snap_count * sizeof(u64),
499 GFP_KERNEL);
500 if (!header->snap_sizes)
501 goto err_names;
502 } else {
503 header->snap_names = NULL;
504 header->snap_sizes = NULL;
505 }
506 memcpy(header->block_name, ondisk->block_name,
507 sizeof(ondisk->block_name));
508
509 header->image_size = le64_to_cpu(ondisk->image_size);
510 header->obj_order = ondisk->options.order;
511 header->crypt_type = ondisk->options.crypt_type;
512 header->comp_type = ondisk->options.comp_type;
513
514 atomic_set(&header->snapc->nref, 1);
515 header->snap_seq = le64_to_cpu(ondisk->snap_seq);
516 header->snapc->num_snaps = snap_count;
517 header->total_snaps = snap_count;
518
Alex Elder21079782012-01-24 10:08:36 -0600519 if (snap_count && allocated_snaps == snap_count) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700520 for (i = 0; i < snap_count; i++) {
521 header->snapc->snaps[i] =
522 le64_to_cpu(ondisk->snaps[i].id);
523 header->snap_sizes[i] =
524 le64_to_cpu(ondisk->snaps[i].image_size);
525 }
526
527 /* copy snapshot names */
528 memcpy(header->snap_names, &ondisk->snaps[i],
529 header->snap_names_len);
530 }
531
532 return 0;
533
534err_names:
535 kfree(header->snap_names);
536err_snapc:
537 kfree(header->snapc);
538 return ret;
539}
540
541static int snap_index(struct rbd_image_header *header, int snap_num)
542{
543 return header->total_snaps - snap_num;
544}
545
546static u64 cur_snap_id(struct rbd_device *rbd_dev)
547{
548 struct rbd_image_header *header = &rbd_dev->header;
549
550 if (!rbd_dev->cur_snap)
551 return 0;
552
553 return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)];
554}
555
556static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
557 u64 *seq, u64 *size)
558{
559 int i;
560 char *p = header->snap_names;
561
562 for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
563 if (strcmp(snap_name, p) == 0)
564 break;
565 }
566 if (i == header->total_snaps)
567 return -ENOENT;
568 if (seq)
569 *seq = header->snapc->snaps[i];
570
571 if (size)
572 *size = header->snap_sizes[i];
573
574 return i;
575}
576
Josh Durgincc9d7342011-11-21 18:19:13 -0800577static int rbd_header_set_snap(struct rbd_device *dev, u64 *size)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700578{
579 struct rbd_image_header *header = &dev->header;
580 struct ceph_snap_context *snapc = header->snapc;
581 int ret = -ENOENT;
582
Josh Durgincc9d7342011-11-21 18:19:13 -0800583 BUILD_BUG_ON(sizeof (dev->snap_name) < sizeof (RBD_SNAP_HEAD_NAME));
584
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700585 down_write(&header->snap_rwsem);
586
Josh Durgincc9d7342011-11-21 18:19:13 -0800587 if (!memcmp(dev->snap_name, RBD_SNAP_HEAD_NAME,
588 sizeof (RBD_SNAP_HEAD_NAME))) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700589 if (header->total_snaps)
590 snapc->seq = header->snap_seq;
591 else
592 snapc->seq = 0;
593 dev->cur_snap = 0;
594 dev->read_only = 0;
595 if (size)
596 *size = header->image_size;
597 } else {
Josh Durgincc9d7342011-11-21 18:19:13 -0800598 ret = snap_by_name(header, dev->snap_name, &snapc->seq, size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700599 if (ret < 0)
600 goto done;
601
602 dev->cur_snap = header->total_snaps - ret;
603 dev->read_only = 1;
604 }
605
606 ret = 0;
607done:
608 up_write(&header->snap_rwsem);
609 return ret;
610}
611
612static void rbd_header_free(struct rbd_image_header *header)
613{
614 kfree(header->snapc);
615 kfree(header->snap_names);
616 kfree(header->snap_sizes);
617}
618
619/*
620 * get the actual striped segment name, offset and length
621 */
622static u64 rbd_get_segment(struct rbd_image_header *header,
623 const char *block_name,
624 u64 ofs, u64 len,
625 char *seg_name, u64 *segofs)
626{
627 u64 seg = ofs >> header->obj_order;
628
629 if (seg_name)
630 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
631 "%s.%012llx", block_name, seg);
632
633 ofs = ofs & ((1 << header->obj_order) - 1);
634 len = min_t(u64, len, (1 << header->obj_order) - ofs);
635
636 if (segofs)
637 *segofs = ofs;
638
639 return len;
640}
641
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700642static int rbd_get_num_segments(struct rbd_image_header *header,
643 u64 ofs, u64 len)
644{
645 u64 start_seg = ofs >> header->obj_order;
646 u64 end_seg = (ofs + len - 1) >> header->obj_order;
647 return end_seg - start_seg + 1;
648}
649
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700650/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700651 * returns the size of an object in the image
652 */
653static u64 rbd_obj_bytes(struct rbd_image_header *header)
654{
655 return 1 << header->obj_order;
656}
657
658/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700659 * bio helpers
660 */
661
662static void bio_chain_put(struct bio *chain)
663{
664 struct bio *tmp;
665
666 while (chain) {
667 tmp = chain;
668 chain = chain->bi_next;
669 bio_put(tmp);
670 }
671}
672
673/*
674 * zeros a bio chain, starting at specific offset
675 */
676static void zero_bio_chain(struct bio *chain, int start_ofs)
677{
678 struct bio_vec *bv;
679 unsigned long flags;
680 void *buf;
681 int i;
682 int pos = 0;
683
684 while (chain) {
685 bio_for_each_segment(bv, chain, i) {
686 if (pos + bv->bv_len > start_ofs) {
687 int remainder = max(start_ofs - pos, 0);
688 buf = bvec_kmap_irq(bv, &flags);
689 memset(buf + remainder, 0,
690 bv->bv_len - remainder);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +0200691 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700692 }
693 pos += bv->bv_len;
694 }
695
696 chain = chain->bi_next;
697 }
698}
699
700/*
701 * bio_chain_clone - clone a chain of bios up to a certain length.
702 * might return a bio_pair that will need to be released.
703 */
704static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
705 struct bio_pair **bp,
706 int len, gfp_t gfpmask)
707{
708 struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
709 int total = 0;
710
711 if (*bp) {
712 bio_pair_release(*bp);
713 *bp = NULL;
714 }
715
716 while (old_chain && (total < len)) {
717 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
718 if (!tmp)
719 goto err_out;
720
721 if (total + old_chain->bi_size > len) {
722 struct bio_pair *bp;
723
724 /*
725 * this split can only happen with a single paged bio,
726 * split_bio will BUG_ON if this is not the case
727 */
728 dout("bio_chain_clone split! total=%d remaining=%d"
729 "bi_size=%d\n",
730 (int)total, (int)len-total,
731 (int)old_chain->bi_size);
732
733 /* split the bio. We'll release it either in the next
734 call, or it will have to be released outside */
735 bp = bio_split(old_chain, (len - total) / 512ULL);
736 if (!bp)
737 goto err_out;
738
739 __bio_clone(tmp, &bp->bio1);
740
741 *next = &bp->bio2;
742 } else {
743 __bio_clone(tmp, old_chain);
744 *next = old_chain->bi_next;
745 }
746
747 tmp->bi_bdev = NULL;
748 gfpmask &= ~__GFP_WAIT;
749 tmp->bi_next = NULL;
750
751 if (!new_chain) {
752 new_chain = tail = tmp;
753 } else {
754 tail->bi_next = tmp;
755 tail = tmp;
756 }
757 old_chain = old_chain->bi_next;
758
759 total += tmp->bi_size;
760 }
761
762 BUG_ON(total < len);
763
764 if (tail)
765 tail->bi_next = NULL;
766
767 *old = old_chain;
768
769 return new_chain;
770
771err_out:
772 dout("bio_chain_clone with err\n");
773 bio_chain_put(new_chain);
774 return NULL;
775}
776
777/*
778 * helpers for osd request op vectors.
779 */
780static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
781 int num_ops,
782 int opcode,
783 u32 payload_len)
784{
785 *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
786 GFP_NOIO);
787 if (!*ops)
788 return -ENOMEM;
789 (*ops)[0].op = opcode;
790 /*
791 * op extent offset and length will be set later on
792 * in calc_raw_layout()
793 */
794 (*ops)[0].payload_len = payload_len;
795 return 0;
796}
797
/* Free an op vector allocated by rbd_create_rw_ops(). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
802
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700803static void rbd_coll_end_req_index(struct request *rq,
804 struct rbd_req_coll *coll,
805 int index,
806 int ret, u64 len)
807{
808 struct request_queue *q;
809 int min, max, i;
810
811 dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n",
812 coll, index, ret, len);
813
814 if (!rq)
815 return;
816
817 if (!coll) {
818 blk_end_request(rq, ret, len);
819 return;
820 }
821
822 q = rq->q;
823
824 spin_lock_irq(q->queue_lock);
825 coll->status[index].done = 1;
826 coll->status[index].rc = ret;
827 coll->status[index].bytes = len;
828 max = min = coll->num_done;
829 while (max < coll->total && coll->status[max].done)
830 max++;
831
832 for (i = min; i<max; i++) {
833 __blk_end_request(rq, coll->status[i].rc,
834 coll->status[i].bytes);
835 coll->num_done++;
836 kref_put(&coll->kref, rbd_coll_release);
837 }
838 spin_unlock_irq(q->queue_lock);
839}
840
841static void rbd_coll_end_req(struct rbd_request *req,
842 int ret, u64 len)
843{
844 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
845}
846
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700847/*
848 * Send ceph osd request
849 */
850static int rbd_do_request(struct request *rq,
851 struct rbd_device *dev,
852 struct ceph_snap_context *snapc,
853 u64 snapid,
854 const char *obj, u64 ofs, u64 len,
855 struct bio *bio,
856 struct page **pages,
857 int num_pages,
858 int flags,
859 struct ceph_osd_req_op *ops,
860 int num_reply,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700861 struct rbd_req_coll *coll,
862 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700863 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700864 struct ceph_msg *msg),
865 struct ceph_osd_request **linger_req,
866 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700867{
868 struct ceph_osd_request *req;
869 struct ceph_file_layout *layout;
870 int ret;
871 u64 bno;
872 struct timespec mtime = CURRENT_TIME;
873 struct rbd_request *req_data;
874 struct ceph_osd_request_head *reqhead;
875 struct rbd_image_header *header = &dev->header;
Alex Elder1dbb4392012-01-24 10:08:37 -0600876 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700877
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700878 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700879 if (!req_data) {
880 if (coll)
881 rbd_coll_end_req_index(rq, coll, coll_index,
882 -ENOMEM, len);
883 return -ENOMEM;
884 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700885
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700886 if (coll) {
887 req_data->coll = coll;
888 req_data->coll_index = coll_index;
889 }
890
891 dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700892
893 down_read(&header->snap_rwsem);
894
Alex Elder1dbb4392012-01-24 10:08:37 -0600895 osdc = &dev->rbd_client->client->osdc;
896 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
897 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -0700898 if (!req) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700899 up_read(&header->snap_rwsem);
Sage Weil4ad12622011-05-03 09:23:36 -0700900 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700901 goto done_pages;
902 }
903
904 req->r_callback = rbd_cb;
905
906 req_data->rq = rq;
907 req_data->bio = bio;
908 req_data->pages = pages;
909 req_data->len = len;
910
911 req->r_priv = req_data;
912
913 reqhead = req->r_request->front.iov_base;
914 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
915
916 strncpy(req->r_oid, obj, sizeof(req->r_oid));
917 req->r_oid_len = strlen(req->r_oid);
918
919 layout = &req->r_file_layout;
920 memset(layout, 0, sizeof(*layout));
921 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
922 layout->fl_stripe_count = cpu_to_le32(1);
923 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
924 layout->fl_pg_preferred = cpu_to_le32(-1);
925 layout->fl_pg_pool = cpu_to_le32(dev->poolid);
Alex Elder1dbb4392012-01-24 10:08:37 -0600926 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
927 req, ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700928
929 ceph_osdc_build_request(req, ofs, &len,
930 ops,
931 snapc,
932 &mtime,
933 req->r_oid, req->r_oid_len);
934 up_read(&header->snap_rwsem);
935
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700936 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600937 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700938 *linger_req = req;
939 }
940
Alex Elder1dbb4392012-01-24 10:08:37 -0600941 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700942 if (ret < 0)
943 goto done_err;
944
945 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600946 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700947 if (ver)
948 *ver = le64_to_cpu(req->r_reassert_version.version);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700949 dout("reassert_ver=%lld\n",
950 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700951 ceph_osdc_put_request(req);
952 }
953 return ret;
954
955done_err:
956 bio_chain_put(req_data->bio);
957 ceph_osdc_put_request(req);
958done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700959 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700960 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700961 return ret;
962}
963
964/*
965 * Ceph osd op callback
966 */
967static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
968{
969 struct rbd_request *req_data = req->r_priv;
970 struct ceph_osd_reply_head *replyhead;
971 struct ceph_osd_op *op;
972 __s32 rc;
973 u64 bytes;
974 int read_op;
975
976 /* parse reply */
977 replyhead = msg->front.iov_base;
978 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
979 op = (void *)(replyhead + 1);
980 rc = le32_to_cpu(replyhead->result);
981 bytes = le64_to_cpu(op->extent.length);
982 read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ);
983
984 dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);
985
986 if (rc == -ENOENT && read_op) {
987 zero_bio_chain(req_data->bio, 0);
988 rc = 0;
989 } else if (rc == 0 && read_op && bytes < req_data->len) {
990 zero_bio_chain(req_data->bio, bytes);
991 bytes = req_data->len;
992 }
993
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700994 rbd_coll_end_req(req_data, rc, bytes);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700995
996 if (req_data->bio)
997 bio_chain_put(req_data->bio);
998
999 ceph_osdc_put_request(req);
1000 kfree(req_data);
1001}
1002
/*
 * Completion callback for requests that need no rbd-side bookkeeping:
 * the only thing left to do is drop the osd request reference.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req,
			      struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1007
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001008/*
1009 * Do a synchronous ceph osd operation
1010 */
1011static int rbd_req_sync_op(struct rbd_device *dev,
1012 struct ceph_snap_context *snapc,
1013 u64 snapid,
1014 int opcode,
1015 int flags,
1016 struct ceph_osd_req_op *orig_ops,
1017 int num_reply,
1018 const char *obj,
1019 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001020 char *buf,
1021 struct ceph_osd_request **linger_req,
1022 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001023{
1024 int ret;
1025 struct page **pages;
1026 int num_pages;
1027 struct ceph_osd_req_op *ops = orig_ops;
1028 u32 payload_len;
1029
1030 num_pages = calc_pages_for(ofs , len);
1031 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
Dan Carpenterb8d06382010-10-11 21:14:23 +02001032 if (IS_ERR(pages))
1033 return PTR_ERR(pages);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001034
1035 if (!orig_ops) {
1036 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
1037 ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
1038 if (ret < 0)
1039 goto done;
1040
1041 if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
1042 ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
1043 if (ret < 0)
1044 goto done_ops;
1045 }
1046 }
1047
1048 ret = rbd_do_request(NULL, dev, snapc, snapid,
1049 obj, ofs, len, NULL,
1050 pages, num_pages,
1051 flags,
1052 ops,
1053 2,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001054 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001055 NULL,
1056 linger_req, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001057 if (ret < 0)
1058 goto done_ops;
1059
1060 if ((flags & CEPH_OSD_FLAG_READ) && buf)
1061 ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
1062
1063done_ops:
1064 if (!orig_ops)
1065 rbd_destroy_ops(ops);
1066done:
1067 ceph_release_page_vector(pages, num_pages);
1068 return ret;
1069}
1070
1071/*
1072 * Do an asynchronous ceph osd operation
1073 */
1074static int rbd_do_op(struct request *rq,
1075 struct rbd_device *rbd_dev ,
1076 struct ceph_snap_context *snapc,
1077 u64 snapid,
1078 int opcode, int flags, int num_reply,
1079 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001080 struct bio *bio,
1081 struct rbd_req_coll *coll,
1082 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001083{
1084 char *seg_name;
1085 u64 seg_ofs;
1086 u64 seg_len;
1087 int ret;
1088 struct ceph_osd_req_op *ops;
1089 u32 payload_len;
1090
1091 seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
1092 if (!seg_name)
1093 return -ENOMEM;
1094
1095 seg_len = rbd_get_segment(&rbd_dev->header,
1096 rbd_dev->header.block_name,
1097 ofs, len,
1098 seg_name, &seg_ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001099
1100 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1101
1102 ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
1103 if (ret < 0)
1104 goto done;
1105
1106 /* we've taken care of segment sizes earlier when we
1107 cloned the bios. We should never have a segment
1108 truncated at this point */
1109 BUG_ON(seg_len < len);
1110
1111 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1112 seg_name, seg_ofs, seg_len,
1113 bio,
1114 NULL, 0,
1115 flags,
1116 ops,
1117 num_reply,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001118 coll, coll_index,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001119 rbd_req_cb, 0, NULL);
Sage Weil11f77002011-05-12 16:13:54 -07001120
1121 rbd_destroy_ops(ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001122done:
1123 kfree(seg_name);
1124 return ret;
1125}
1126
1127/*
1128 * Request async osd write
1129 */
1130static int rbd_req_write(struct request *rq,
1131 struct rbd_device *rbd_dev,
1132 struct ceph_snap_context *snapc,
1133 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001134 struct bio *bio,
1135 struct rbd_req_coll *coll,
1136 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001137{
1138 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1139 CEPH_OSD_OP_WRITE,
1140 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1141 2,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001142 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001143}
1144
1145/*
1146 * Request async osd read
1147 */
1148static int rbd_req_read(struct request *rq,
1149 struct rbd_device *rbd_dev,
1150 u64 snapid,
1151 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001152 struct bio *bio,
1153 struct rbd_req_coll *coll,
1154 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001155{
1156 return rbd_do_op(rq, rbd_dev, NULL,
1157 (snapid ? snapid : CEPH_NOSNAP),
1158 CEPH_OSD_OP_READ,
1159 CEPH_OSD_FLAG_READ,
1160 2,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001161 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001162}
1163
1164/*
1165 * Request sync osd read
1166 */
1167static int rbd_req_sync_read(struct rbd_device *dev,
1168 struct ceph_snap_context *snapc,
1169 u64 snapid,
1170 const char *obj,
1171 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001172 char *buf,
1173 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001174{
1175 return rbd_req_sync_op(dev, NULL,
1176 (snapid ? snapid : CEPH_NOSNAP),
1177 CEPH_OSD_OP_READ,
1178 CEPH_OSD_FLAG_READ,
1179 NULL,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001180 1, obj, ofs, len, buf, NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001181}
1182
1183/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001184 * Request sync osd watch
1185 */
1186static int rbd_req_sync_notify_ack(struct rbd_device *dev,
1187 u64 ver,
1188 u64 notify_id,
1189 const char *obj)
1190{
1191 struct ceph_osd_req_op *ops;
1192 struct page **pages = NULL;
Sage Weil11f77002011-05-12 16:13:54 -07001193 int ret;
1194
1195 ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001196 if (ret < 0)
1197 return ret;
1198
1199 ops[0].watch.ver = cpu_to_le64(dev->header.obj_version);
1200 ops[0].watch.cookie = notify_id;
1201 ops[0].watch.flag = 0;
1202
1203 ret = rbd_do_request(NULL, dev, NULL, CEPH_NOSNAP,
1204 obj, 0, 0, NULL,
1205 pages, 0,
1206 CEPH_OSD_FLAG_READ,
1207 ops,
1208 1,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001209 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001210 rbd_simple_req_cb, 0, NULL);
1211
1212 rbd_destroy_ops(ops);
1213 return ret;
1214}
1215
1216static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1217{
1218 struct rbd_device *dev = (struct rbd_device *)data;
Sage Weil13143d22011-05-12 16:08:30 -07001219 int rc;
1220
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001221 if (!dev)
1222 return;
1223
1224 dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
1225 notify_id, (int)opcode);
1226 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Sage Weil13143d22011-05-12 16:08:30 -07001227 rc = __rbd_update_snaps(dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001228 mutex_unlock(&ctl_mutex);
Sage Weil13143d22011-05-12 16:08:30 -07001229 if (rc)
Alex Elderf0f8cef2012-01-29 13:57:44 -06001230 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
1231 " update snaps: %d\n", dev->major, rc);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001232
1233 rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name);
1234}
1235
1236/*
1237 * Request sync osd watch
1238 */
1239static int rbd_req_sync_watch(struct rbd_device *dev,
1240 const char *obj,
1241 u64 ver)
1242{
1243 struct ceph_osd_req_op *ops;
Alex Elder1dbb4392012-01-24 10:08:37 -06001244 struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001245
1246 int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
1247 if (ret < 0)
1248 return ret;
1249
1250 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
1251 (void *)dev, &dev->watch_event);
1252 if (ret < 0)
1253 goto fail;
1254
1255 ops[0].watch.ver = cpu_to_le64(ver);
1256 ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
1257 ops[0].watch.flag = 1;
1258
1259 ret = rbd_req_sync_op(dev, NULL,
1260 CEPH_NOSNAP,
1261 0,
1262 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1263 ops,
1264 1, obj, 0, 0, NULL,
1265 &dev->watch_request, NULL);
1266
1267 if (ret < 0)
1268 goto fail_event;
1269
1270 rbd_destroy_ops(ops);
1271 return 0;
1272
1273fail_event:
1274 ceph_osdc_cancel_event(dev->watch_event);
1275 dev->watch_event = NULL;
1276fail:
1277 rbd_destroy_ops(ops);
1278 return ret;
1279}
1280
Yehuda Sadeh79e30572011-07-12 16:56:57 -07001281/*
1282 * Request sync osd unwatch
1283 */
1284static int rbd_req_sync_unwatch(struct rbd_device *dev,
1285 const char *obj)
1286{
1287 struct ceph_osd_req_op *ops;
1288
1289 int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
1290 if (ret < 0)
1291 return ret;
1292
1293 ops[0].watch.ver = 0;
1294 ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
1295 ops[0].watch.flag = 0;
1296
1297 ret = rbd_req_sync_op(dev, NULL,
1298 CEPH_NOSNAP,
1299 0,
1300 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1301 ops,
1302 1, obj, 0, 0, NULL, NULL, NULL);
1303
1304 rbd_destroy_ops(ops);
1305 ceph_osdc_cancel_event(dev->watch_event);
1306 dev->watch_event = NULL;
1307 return ret;
1308}
1309
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001310struct rbd_notify_info {
1311 struct rbd_device *dev;
1312};
1313
1314static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1315{
1316 struct rbd_device *dev = (struct rbd_device *)data;
1317 if (!dev)
1318 return;
1319
1320 dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
1321 notify_id, (int)opcode);
1322}
1323
1324/*
1325 * Request sync osd notify
1326 */
1327static int rbd_req_sync_notify(struct rbd_device *dev,
1328 const char *obj)
1329{
1330 struct ceph_osd_req_op *ops;
Alex Elder1dbb4392012-01-24 10:08:37 -06001331 struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001332 struct ceph_osd_event *event;
1333 struct rbd_notify_info info;
1334 int payload_len = sizeof(u32) + sizeof(u32);
1335 int ret;
1336
1337 ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
1338 if (ret < 0)
1339 return ret;
1340
1341 info.dev = dev;
1342
1343 ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
1344 (void *)&info, &event);
1345 if (ret < 0)
1346 goto fail;
1347
1348 ops[0].watch.ver = 1;
1349 ops[0].watch.flag = 1;
1350 ops[0].watch.cookie = event->cookie;
1351 ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
1352 ops[0].watch.timeout = 12;
1353
1354 ret = rbd_req_sync_op(dev, NULL,
1355 CEPH_NOSNAP,
1356 0,
1357 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1358 ops,
1359 1, obj, 0, 0, NULL, NULL, NULL);
1360 if (ret < 0)
1361 goto fail_event;
1362
1363 ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
1364 dout("ceph_osdc_wait_event returned %d\n", ret);
1365 rbd_destroy_ops(ops);
1366 return 0;
1367
1368fail_event:
1369 ceph_osdc_cancel_event(event);
1370fail:
1371 rbd_destroy_ops(ops);
1372 return ret;
1373}
1374
1375/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001376 * Request sync osd read
1377 */
1378static int rbd_req_sync_exec(struct rbd_device *dev,
1379 const char *obj,
1380 const char *cls,
1381 const char *method,
1382 const char *data,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001383 int len,
1384 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001385{
1386 struct ceph_osd_req_op *ops;
1387 int cls_len = strlen(cls);
1388 int method_len = strlen(method);
1389 int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
1390 cls_len + method_len + len);
1391 if (ret < 0)
1392 return ret;
1393
1394 ops[0].cls.class_name = cls;
1395 ops[0].cls.class_len = (__u8)cls_len;
1396 ops[0].cls.method_name = method;
1397 ops[0].cls.method_len = (__u8)method_len;
1398 ops[0].cls.argc = 0;
1399 ops[0].cls.indata = data;
1400 ops[0].cls.indata_len = len;
1401
1402 ret = rbd_req_sync_op(dev, NULL,
1403 CEPH_NOSNAP,
1404 0,
1405 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1406 ops,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001407 1, obj, 0, 0, NULL, NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001408
1409 rbd_destroy_ops(ops);
1410
1411 dout("cls_exec returned %d\n", ret);
1412 return ret;
1413}
1414
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001415static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1416{
1417 struct rbd_req_coll *coll =
1418 kzalloc(sizeof(struct rbd_req_coll) +
1419 sizeof(struct rbd_req_status) * num_reqs,
1420 GFP_ATOMIC);
1421
1422 if (!coll)
1423 return NULL;
1424 coll->total = num_reqs;
1425 kref_init(&coll->kref);
1426 return coll;
1427}
1428
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001429/*
1430 * block device queue callback
1431 */
1432static void rbd_rq_fn(struct request_queue *q)
1433{
1434 struct rbd_device *rbd_dev = q->queuedata;
1435 struct request *rq;
1436 struct bio_pair *bp = NULL;
1437
1438 rq = blk_fetch_request(q);
1439
1440 while (1) {
1441 struct bio *bio;
1442 struct bio *rq_bio, *next_bio = NULL;
1443 bool do_write;
1444 int size, op_size = 0;
1445 u64 ofs;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001446 int num_segs, cur_seg = 0;
1447 struct rbd_req_coll *coll;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001448
1449 /* peek at request from block layer */
1450 if (!rq)
1451 break;
1452
1453 dout("fetched request\n");
1454
1455 /* filter out block requests we don't understand */
1456 if ((rq->cmd_type != REQ_TYPE_FS)) {
1457 __blk_end_request_all(rq, 0);
1458 goto next;
1459 }
1460
1461 /* deduce our operation (read, write) */
1462 do_write = (rq_data_dir(rq) == WRITE);
1463
1464 size = blk_rq_bytes(rq);
1465 ofs = blk_rq_pos(rq) * 512ULL;
1466 rq_bio = rq->bio;
1467 if (do_write && rbd_dev->read_only) {
1468 __blk_end_request_all(rq, -EROFS);
1469 goto next;
1470 }
1471
1472 spin_unlock_irq(q->queue_lock);
1473
1474 dout("%s 0x%x bytes at 0x%llx\n",
1475 do_write ? "write" : "read",
1476 size, blk_rq_pos(rq) * 512ULL);
1477
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001478 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
1479 coll = rbd_alloc_coll(num_segs);
1480 if (!coll) {
1481 spin_lock_irq(q->queue_lock);
1482 __blk_end_request_all(rq, -ENOMEM);
1483 goto next;
1484 }
1485
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001486 do {
1487 /* a bio clone to be passed down to OSD req */
1488 dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
1489 op_size = rbd_get_segment(&rbd_dev->header,
1490 rbd_dev->header.block_name,
1491 ofs, size,
1492 NULL, NULL);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001493 kref_get(&coll->kref);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001494 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1495 op_size, GFP_ATOMIC);
1496 if (!bio) {
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001497 rbd_coll_end_req_index(rq, coll, cur_seg,
1498 -ENOMEM, op_size);
1499 goto next_seg;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001500 }
1501
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001502
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001503 /* init OSD command: write or read */
1504 if (do_write)
1505 rbd_req_write(rq, rbd_dev,
1506 rbd_dev->header.snapc,
1507 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001508 op_size, bio,
1509 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001510 else
1511 rbd_req_read(rq, rbd_dev,
1512 cur_snap_id(rbd_dev),
1513 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001514 op_size, bio,
1515 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001516
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001517next_seg:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001518 size -= op_size;
1519 ofs += op_size;
1520
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001521 cur_seg++;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001522 rq_bio = next_bio;
1523 } while (size > 0);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001524 kref_put(&coll->kref, rbd_coll_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001525
1526 if (bp)
1527 bio_pair_release(bp);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001528 spin_lock_irq(q->queue_lock);
1529next:
1530 rq = blk_fetch_request(q);
1531 }
1532}
1533
1534/*
1535 * a queue callback. Makes sure that we don't create a bio that spans across
1536 * multiple osd objects. One exception would be with a single page bios,
1537 * which we handle later at bio_chain_clone
1538 */
1539static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1540 struct bio_vec *bvec)
1541{
1542 struct rbd_device *rbd_dev = q->queuedata;
1543 unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9);
1544 sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1545 unsigned int bio_sectors = bmd->bi_size >> 9;
1546 int max;
1547
1548 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
1549 + bio_sectors)) << 9;
1550 if (max < 0)
1551 max = 0; /* bio_add cannot handle a negative return */
1552 if (max <= bvec->bv_len && bio_sectors == 0)
1553 return bvec->bv_len;
1554 return max;
1555}
1556
1557static void rbd_free_disk(struct rbd_device *rbd_dev)
1558{
1559 struct gendisk *disk = rbd_dev->disk;
1560
1561 if (!disk)
1562 return;
1563
1564 rbd_header_free(&rbd_dev->header);
1565
1566 if (disk->flags & GENHD_FL_UP)
1567 del_gendisk(disk);
1568 if (disk->queue)
1569 blk_cleanup_queue(disk->queue);
1570 put_disk(disk);
1571}
1572
1573/*
1574 * reload the ondisk the header
1575 */
1576static int rbd_read_header(struct rbd_device *rbd_dev,
1577 struct rbd_image_header *header)
1578{
1579 ssize_t rc;
1580 struct rbd_image_header_ondisk *dh;
1581 int snap_count = 0;
1582 u64 snap_names_len = 0;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001583 u64 ver;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001584
1585 while (1) {
1586 int len = sizeof(*dh) +
1587 snap_count * sizeof(struct rbd_image_snap_ondisk) +
1588 snap_names_len;
1589
1590 rc = -ENOMEM;
1591 dh = kmalloc(len, GFP_KERNEL);
1592 if (!dh)
1593 return -ENOMEM;
1594
1595 rc = rbd_req_sync_read(rbd_dev,
1596 NULL, CEPH_NOSNAP,
1597 rbd_dev->obj_md_name,
1598 0, len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001599 (char *)dh, &ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001600 if (rc < 0)
1601 goto out_dh;
1602
1603 rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
Josh Durgin81e759f2011-11-15 14:49:53 -08001604 if (rc < 0) {
1605 if (rc == -ENXIO) {
1606 pr_warning("unrecognized header format"
1607 " for image %s", rbd_dev->obj);
1608 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001609 goto out_dh;
Josh Durgin81e759f2011-11-15 14:49:53 -08001610 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001611
1612 if (snap_count != header->total_snaps) {
1613 snap_count = header->total_snaps;
1614 snap_names_len = header->snap_names_len;
1615 rbd_header_free(header);
1616 kfree(dh);
1617 continue;
1618 }
1619 break;
1620 }
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001621 header->obj_version = ver;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001622
1623out_dh:
1624 kfree(dh);
1625 return rc;
1626}
1627
1628/*
1629 * create a snapshot
1630 */
1631static int rbd_header_add_snap(struct rbd_device *dev,
1632 const char *snap_name,
1633 gfp_t gfp_flags)
1634{
1635 int name_len = strlen(snap_name);
1636 u64 new_snapid;
1637 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001638 void *data, *p, *e;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001639 u64 ver;
Alex Elder1dbb4392012-01-24 10:08:37 -06001640 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001641
1642 /* we should create a snapshot only if we're pointing at the head */
1643 if (dev->cur_snap)
1644 return -EINVAL;
1645
Alex Elder1dbb4392012-01-24 10:08:37 -06001646 monc = &dev->rbd_client->client->monc;
1647 ret = ceph_monc_create_snapid(monc, dev->poolid, &new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001648 dout("created snapid=%lld\n", new_snapid);
1649 if (ret < 0)
1650 return ret;
1651
1652 data = kmalloc(name_len + 16, gfp_flags);
1653 if (!data)
1654 return -ENOMEM;
1655
Sage Weil916d4d62011-05-12 16:10:50 -07001656 p = data;
1657 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001658
Sage Weil916d4d62011-05-12 16:10:50 -07001659 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1660 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001661
1662 ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
Sage Weil916d4d62011-05-12 16:10:50 -07001663 data, p - data, &ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001664
Sage Weil916d4d62011-05-12 16:10:50 -07001665 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001666
1667 if (ret < 0)
1668 return ret;
1669
1670 dev->header.snapc->seq = new_snapid;
1671
1672 return 0;
1673bad:
1674 return -ERANGE;
1675}
1676
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001677static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1678{
1679 struct rbd_snap *snap;
1680
1681 while (!list_empty(&rbd_dev->snaps)) {
1682 snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node);
1683 __rbd_remove_snap_dev(rbd_dev, snap);
1684 }
1685}
1686
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001687/*
1688 * only read the first part of the ondisk header, without the snaps info
1689 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001690static int __rbd_update_snaps(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001691{
1692 int ret;
1693 struct rbd_image_header h;
1694 u64 snap_seq;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001695 int follow_seq = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001696
1697 ret = rbd_read_header(rbd_dev, &h);
1698 if (ret < 0)
1699 return ret;
1700
Sage Weil9db4b3e2011-04-19 22:49:06 -07001701 /* resized? */
1702 set_capacity(rbd_dev->disk, h.image_size / 512ULL);
1703
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001704 down_write(&rbd_dev->header.snap_rwsem);
1705
1706 snap_seq = rbd_dev->header.snapc->seq;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001707 if (rbd_dev->header.total_snaps &&
1708 rbd_dev->header.snapc->snaps[0] == snap_seq)
1709 /* pointing at the head, will need to follow that
1710 if head moves */
1711 follow_seq = 1;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001712
1713 kfree(rbd_dev->header.snapc);
1714 kfree(rbd_dev->header.snap_names);
1715 kfree(rbd_dev->header.snap_sizes);
1716
1717 rbd_dev->header.total_snaps = h.total_snaps;
1718 rbd_dev->header.snapc = h.snapc;
1719 rbd_dev->header.snap_names = h.snap_names;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001720 rbd_dev->header.snap_names_len = h.snap_names_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001721 rbd_dev->header.snap_sizes = h.snap_sizes;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001722 if (follow_seq)
1723 rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0];
1724 else
1725 rbd_dev->header.snapc->seq = snap_seq;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001726
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001727 ret = __rbd_init_snaps_header(rbd_dev);
1728
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001729 up_write(&rbd_dev->header.snap_rwsem);
1730
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001731 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001732}
1733
1734static int rbd_init_disk(struct rbd_device *rbd_dev)
1735{
1736 struct gendisk *disk;
1737 struct request_queue *q;
1738 int rc;
1739 u64 total_size = 0;
1740
1741 /* contact OSD, request size info about the object being mapped */
1742 rc = rbd_read_header(rbd_dev, &rbd_dev->header);
1743 if (rc)
1744 return rc;
1745
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001746 /* no need to lock here, as rbd_dev is not registered yet */
1747 rc = __rbd_init_snaps_header(rbd_dev);
1748 if (rc)
1749 return rc;
1750
Josh Durgincc9d7342011-11-21 18:19:13 -08001751 rc = rbd_header_set_snap(rbd_dev, &total_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001752 if (rc)
1753 return rc;
1754
1755 /* create gendisk info */
1756 rc = -ENOMEM;
1757 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1758 if (!disk)
1759 goto out;
1760
Alex Elderf0f8cef2012-01-29 13:57:44 -06001761 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Sage Weilaedfec52011-05-12 20:57:03 -07001762 rbd_dev->id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001763 disk->major = rbd_dev->major;
1764 disk->first_minor = 0;
1765 disk->fops = &rbd_bd_ops;
1766 disk->private_data = rbd_dev;
1767
1768 /* init rq */
1769 rc = -ENOMEM;
1770 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1771 if (!q)
1772 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07001773
1774 /* set io sizes to object size */
1775 blk_queue_max_hw_sectors(q, rbd_obj_bytes(&rbd_dev->header) / 512ULL);
1776 blk_queue_max_segment_size(q, rbd_obj_bytes(&rbd_dev->header));
1777 blk_queue_io_min(q, rbd_obj_bytes(&rbd_dev->header));
1778 blk_queue_io_opt(q, rbd_obj_bytes(&rbd_dev->header));
1779
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001780 blk_queue_merge_bvec(q, rbd_merge_bvec);
1781 disk->queue = q;
1782
1783 q->queuedata = rbd_dev;
1784
1785 rbd_dev->disk = disk;
1786 rbd_dev->q = q;
1787
1788 /* finally, announce the disk to the world */
1789 set_capacity(disk, total_size / 512ULL);
1790 add_disk(disk);
1791
1792 pr_info("%s: added with size 0x%llx\n",
1793 disk->disk_name, (unsigned long long)total_size);
1794 return 0;
1795
1796out_disk:
1797 put_disk(disk);
1798out:
1799 return rc;
1800}
1801
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001802/*
1803 sysfs
1804*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001805
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001806static ssize_t rbd_size_show(struct device *dev,
1807 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001808{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001809 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1810
1811 return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001812}
1813
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001814static ssize_t rbd_major_show(struct device *dev,
1815 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001816{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001817 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1818
1819 return sprintf(buf, "%d\n", rbd_dev->major);
1820}
1821
1822static ssize_t rbd_client_id_show(struct device *dev,
1823 struct device_attribute *attr, char *buf)
1824{
1825 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1826
Alex Elder1dbb4392012-01-24 10:08:37 -06001827 return sprintf(buf, "client%lld\n",
1828 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001829}
1830
1831static ssize_t rbd_pool_show(struct device *dev,
1832 struct device_attribute *attr, char *buf)
1833{
1834 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1835
1836 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1837}
1838
1839static ssize_t rbd_name_show(struct device *dev,
1840 struct device_attribute *attr, char *buf)
1841{
1842 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1843
1844 return sprintf(buf, "%s\n", rbd_dev->obj);
1845}
1846
1847static ssize_t rbd_snap_show(struct device *dev,
1848 struct device_attribute *attr,
1849 char *buf)
1850{
1851 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1852
1853 return sprintf(buf, "%s\n", rbd_dev->snap_name);
1854}
1855
1856static ssize_t rbd_image_refresh(struct device *dev,
1857 struct device_attribute *attr,
1858 const char *buf,
1859 size_t size)
1860{
1861 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1862 int rc;
1863 int ret = size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001864
1865 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1866
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001867 rc = __rbd_update_snaps(rbd_dev);
1868 if (rc < 0)
1869 ret = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001870
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001871 mutex_unlock(&ctl_mutex);
1872 return ret;
1873}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001874
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001875static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
1876static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
1877static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
1878static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
1879static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
1880static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
1881static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
1882static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001883
1884static struct attribute *rbd_attrs[] = {
1885 &dev_attr_size.attr,
1886 &dev_attr_major.attr,
1887 &dev_attr_client_id.attr,
1888 &dev_attr_pool.attr,
1889 &dev_attr_name.attr,
1890 &dev_attr_current_snap.attr,
1891 &dev_attr_refresh.attr,
1892 &dev_attr_create_snap.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001893 NULL
1894};
1895
1896static struct attribute_group rbd_attr_group = {
1897 .attrs = rbd_attrs,
1898};
1899
1900static const struct attribute_group *rbd_attr_groups[] = {
1901 &rbd_attr_group,
1902 NULL
1903};
1904
1905static void rbd_sysfs_dev_release(struct device *dev)
1906{
1907}
1908
1909static struct device_type rbd_device_type = {
1910 .name = "rbd",
1911 .groups = rbd_attr_groups,
1912 .release = rbd_sysfs_dev_release,
1913};
1914
1915
1916/*
1917 sysfs - snapshots
1918*/
1919
1920static ssize_t rbd_snap_size_show(struct device *dev,
1921 struct device_attribute *attr,
1922 char *buf)
1923{
1924 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1925
1926 return sprintf(buf, "%lld\n", (long long)snap->size);
1927}
1928
1929static ssize_t rbd_snap_id_show(struct device *dev,
1930 struct device_attribute *attr,
1931 char *buf)
1932{
1933 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1934
1935 return sprintf(buf, "%lld\n", (long long)snap->id);
1936}
1937
1938static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
1939static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
1940
1941static struct attribute *rbd_snap_attrs[] = {
1942 &dev_attr_snap_size.attr,
1943 &dev_attr_snap_id.attr,
1944 NULL,
1945};
1946
1947static struct attribute_group rbd_snap_attr_group = {
1948 .attrs = rbd_snap_attrs,
1949};
1950
1951static void rbd_snap_dev_release(struct device *dev)
1952{
1953 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1954 kfree(snap->name);
1955 kfree(snap);
1956}
1957
1958static const struct attribute_group *rbd_snap_attr_groups[] = {
1959 &rbd_snap_attr_group,
1960 NULL
1961};
1962
1963static struct device_type rbd_snap_device_type = {
1964 .groups = rbd_snap_attr_groups,
1965 .release = rbd_snap_dev_release,
1966};
1967
1968static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
1969 struct rbd_snap *snap)
1970{
1971 list_del(&snap->node);
1972 device_unregister(&snap->dev);
1973}
1974
1975static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
1976 struct rbd_snap *snap,
1977 struct device *parent)
1978{
1979 struct device *dev = &snap->dev;
1980 int ret;
1981
1982 dev->type = &rbd_snap_device_type;
1983 dev->parent = parent;
1984 dev->release = rbd_snap_dev_release;
1985 dev_set_name(dev, "snap_%s", snap->name);
1986 ret = device_register(dev);
1987
1988 return ret;
1989}
1990
1991static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
1992 int i, const char *name,
1993 struct rbd_snap **snapp)
1994{
1995 int ret;
1996 struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
1997 if (!snap)
1998 return -ENOMEM;
1999 snap->name = kstrdup(name, GFP_KERNEL);
2000 snap->size = rbd_dev->header.snap_sizes[i];
2001 snap->id = rbd_dev->header.snapc->snaps[i];
2002 if (device_is_registered(&rbd_dev->dev)) {
2003 ret = rbd_register_snap_dev(rbd_dev, snap,
2004 &rbd_dev->dev);
2005 if (ret < 0)
2006 goto err;
2007 }
2008 *snapp = snap;
2009 return 0;
2010err:
2011 kfree(snap->name);
2012 kfree(snap);
2013 return ret;
2014}
2015
/*
 * Step backward through a list of NUL-terminated strings stored back
 * to back starting at @start.
 *
 * @name points at the beginning of an entry (or just past the last
 * entry).  Returns a pointer to the beginning of the entry preceding
 * it, or NULL if @name is less than two bytes past @start.
 */
const char *rbd_prev_snap_name(const char *name, const char *start)
{
	const char *cursor;

	if (name < start + 2)
		return NULL;

	/* name[-1] is the previous entry's terminator; begin before it */
	for (cursor = name - 2; *cursor != '\0'; cursor--) {
		if (cursor == start)
			return start;
	}

	/* stopped on the terminator of the entry before the previous one */
	return cursor + 1;
}
2032
2033/*
 * Compare the old list of snapshots that we have to what's in the
 * header and update it accordingly.  Note that the header holds the
 * snapshots in reverse order (from newest to oldest) and we need to go
 * from older to newer so that we don't get a duplicate snap name while
 * doing so (e.g., a snapshot was removed and a new one was created
 * with the same name).
 *
 * The existing rbd_dev->snaps list is walked backward while the header
 * index i counts down from header.total_snaps and name steps backward
 * through header.snap_names.  Entries of the old list that no longer
 * match the header are removed, and header entries not yet on the list
 * are created with __rbd_add_snap_dev() and linked in.
 *
 * Returns 0 on success, -EINVAL if the name list runs out before the
 * index does, or the error returned by __rbd_add_snap_dev().
 */
2041static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
2042{
2043 const char *name, *first_name;
2044 int i = rbd_dev->header.total_snaps;
2045 struct rbd_snap *snap, *old_snap = NULL;
2046 int ret;
2047 struct list_head *p, *n;
2048
2049 first_name = rbd_dev->header.snap_names;
 /* start just past the end of the name buffer; names are walked backward */
2050 name = first_name + rbd_dev->header.snap_names_len;
2051
2052 list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
2053 u64 cur_id;
2054
2055 old_snap = list_entry(p, struct rbd_snap, node);
2056
 /* cur_id is only assigned (and only read below) when i is non-zero */
2057 if (i)
2058 cur_id = rbd_dev->header.snapc->snaps[i - 1];
2059
2060 if (!i || old_snap->id < cur_id) {
2061 /* old_snap->id was skipped, thus was removed */
2062 __rbd_remove_snap_dev(rbd_dev, old_snap);
2063 continue;
2064 }
2065 if (old_snap->id == cur_id) {
2066 /* we have this snapshot already */
2067 i--;
2068 name = rbd_prev_snap_name(name, first_name);
2069 continue;
2070 }
 /*
  * Reached only when old_snap->id > cur_id: the header has
  * entries that the old list does not.
  *
  * NOTE(review): unlike the trailing loop below, name is not
  * stepped back before its first use here, so on the first
  * pass it may still be the end-of-buffer pointer or the name
  * of the previously matched entry -- confirm intent.
  */
2071 for (; i > 0;
2072 i--, name = rbd_prev_snap_name(name, first_name)) {
2073 if (!name) {
2074 WARN_ON(1);
2075 return -EINVAL;
2076 }
 /*
  * NOTE(review): this reads snaps[i] while every other access
  * in this function uses snaps[i - 1]; when i still equals
  * total_snaps the index is total_snaps itself, which looks
  * like it could be one past the last entry -- confirm.
  */
2077 cur_id = rbd_dev->header.snapc->snaps[i];
2078 /* snapshot removal? handle it above */
2079 if (cur_id >= old_snap->id)
2080 break;
2081 /* a new snapshot */
2082 ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
2083 if (ret < 0)
2084 return ret;
2085
2086 /* note that we add it backward so using n and not p */
2087 list_add(&snap->node, n);
2088 p = &snap->node;
2089 }
2090 }
2091 /* we're done going over the old snap list, just add what's left */
2092 for (; i > 0; i--) {
2093 name = rbd_prev_snap_name(name, first_name);
2094 if (!name) {
2095 WARN_ON(1);
2096 return -EINVAL;
2097 }
2098 ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
2099 if (ret < 0)
2100 return ret;
 /* insert at the head of the list */
2101 list_add(&snap->node, &rbd_dev->snaps);
2102 }
2103
2104 return 0;
2105}
2106
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002107static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2108{
Alex Elderf0f8cef2012-01-29 13:57:44 -06002109 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002110 struct device *dev;
2111 struct rbd_snap *snap;
2112
2113 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2114 dev = &rbd_dev->dev;
2115
2116 dev->bus = &rbd_bus_type;
2117 dev->type = &rbd_device_type;
2118 dev->parent = &rbd_root_dev;
2119 dev->release = rbd_dev_release;
2120 dev_set_name(dev, "%d", rbd_dev->id);
2121 ret = device_register(dev);
2122 if (ret < 0)
Alex Elderf0f8cef2012-01-29 13:57:44 -06002123 goto out;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002124
2125 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2126 ret = rbd_register_snap_dev(rbd_dev, snap,
2127 &rbd_dev->dev);
2128 if (ret < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002129 break;
2130 }
Alex Elderf0f8cef2012-01-29 13:57:44 -06002131out:
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002132 mutex_unlock(&ctl_mutex);
2133 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002134}
2135
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002136static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2137{
2138 device_unregister(&rbd_dev->dev);
2139}
2140
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002141static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2142{
2143 int ret, rc;
2144
2145 do {
2146 ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name,
2147 rbd_dev->header.obj_version);
2148 if (ret == -ERANGE) {
2149 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2150 rc = __rbd_update_snaps(rbd_dev);
2151 mutex_unlock(&ctl_mutex);
2152 if (rc < 0)
2153 return rc;
2154 }
2155 } while (ret == -ERANGE);
2156
2157 return ret;
2158}
2159
Alex Elder1ddbe942012-01-29 13:57:44 -06002160static atomic64_t rbd_id_max = ATOMIC64_INIT(0);
2161
2162/*
Alex Elder499afd52012-02-02 08:13:29 -06002163 * Get a unique rbd identifier for the given new rbd_dev, and add
2164 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06002165 */
Alex Elder499afd52012-02-02 08:13:29 -06002166static void rbd_id_get(struct rbd_device *rbd_dev)
Alex Elderb7f23c32012-01-29 13:57:43 -06002167{
Alex Elder499afd52012-02-02 08:13:29 -06002168 rbd_dev->id = atomic64_inc_return(&rbd_id_max);
2169
2170 spin_lock(&rbd_dev_list_lock);
2171 list_add_tail(&rbd_dev->node, &rbd_dev_list);
2172 spin_unlock(&rbd_dev_list_lock);
Alex Elder1ddbe942012-01-29 13:57:44 -06002173}
Alex Elderb7f23c32012-01-29 13:57:43 -06002174
Alex Elder1ddbe942012-01-29 13:57:44 -06002175/*
Alex Elder499afd52012-02-02 08:13:29 -06002176 * Remove an rbd_dev from the global list, and record that its
2177 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002178 */
Alex Elder499afd52012-02-02 08:13:29 -06002179static void rbd_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002180{
Alex Elderd184f6b2012-01-29 13:57:44 -06002181 struct list_head *tmp;
2182 int rbd_id = rbd_dev->id;
2183 int max_id;
2184
2185 BUG_ON(rbd_id < 1);
Alex Elder499afd52012-02-02 08:13:29 -06002186
2187 spin_lock(&rbd_dev_list_lock);
2188 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002189
2190 /*
2191 * If the id being "put" is not the current maximum, there
2192 * is nothing special we need to do.
2193 */
2194 if (rbd_id != atomic64_read(&rbd_id_max)) {
2195 spin_unlock(&rbd_dev_list_lock);
2196 return;
2197 }
2198
2199 /*
2200 * We need to update the current maximum id. Search the
2201 * list to find out what it is. We're more likely to find
2202 * the maximum at the end, so search the list backward.
2203 */
2204 max_id = 0;
2205 list_for_each_prev(tmp, &rbd_dev_list) {
2206 struct rbd_device *rbd_dev;
2207
2208 rbd_dev = list_entry(tmp, struct rbd_device, node);
2209 if (rbd_id > max_id)
2210 max_id = rbd_id;
2211 }
Alex Elder499afd52012-02-02 08:13:29 -06002212 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002213
Alex Elder1ddbe942012-01-29 13:57:44 -06002214 /*
Alex Elderd184f6b2012-01-29 13:57:44 -06002215 * The max id could have been updated by rbd_id_get(), in
2216 * which case it now accurately reflects the new maximum.
2217 * Be careful not to overwrite the maximum value in that
2218 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002219 */
Alex Elderd184f6b2012-01-29 13:57:44 -06002220 atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
Alex Elderb7f23c32012-01-29 13:57:43 -06002221}
2222
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002223static ssize_t rbd_add(struct bus_type *bus,
2224 const char *buf,
2225 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002226{
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002227 struct rbd_device *rbd_dev;
Alex Elder27cc2592012-02-02 08:13:30 -06002228 char *mon_dev_name = NULL;
2229 char *options = NULL;
2230 struct ceph_osd_client *osdc;
2231 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002232
2233 if (!try_module_get(THIS_MODULE))
2234 return -ENODEV;
2235
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002236 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2237 if (!rbd_dev)
Alex Elder27cc2592012-02-02 08:13:30 -06002238 goto err_nomem;
2239 mon_dev_name = kmalloc(count, GFP_KERNEL);
2240 if (!mon_dev_name)
2241 goto err_nomem;
2242 options = kmalloc(count, GFP_KERNEL);
2243 if (!options)
2244 goto err_nomem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002245
2246 /* static rbd_device initialization */
2247 spin_lock_init(&rbd_dev->lock);
2248 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002249 INIT_LIST_HEAD(&rbd_dev->snaps);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002250
Alex Elder0e805a12012-01-11 19:42:15 -08002251 init_rwsem(&rbd_dev->header.snap_rwsem);
2252
Alex Elderd184f6b2012-01-29 13:57:44 -06002253 /* generate unique id: find highest unique id, add one */
Alex Elder499afd52012-02-02 08:13:29 -06002254 rbd_id_get(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002255
2256 /* parse add command */
2257 if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s "
2258 "%" __stringify(RBD_MAX_OPT_LEN) "s "
2259 "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s "
2260 "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s"
2261 "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
2262 mon_dev_name, options, rbd_dev->pool_name,
2263 rbd_dev->obj, rbd_dev->snap_name) < 4) {
2264 rc = -EINVAL;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002265 goto err_put_id;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002266 }
2267
2268 if (rbd_dev->snap_name[0] == 0)
Josh Durgincc9d7342011-11-21 18:19:13 -08002269 memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
2270 sizeof (RBD_SNAP_HEAD_NAME));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002271
2272 rbd_dev->obj_len = strlen(rbd_dev->obj);
2273 snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s",
2274 rbd_dev->obj, RBD_SUFFIX);
2275
2276 /* initialize rest of new object */
Alex Elderf0f8cef2012-01-29 13:57:44 -06002277 snprintf(rbd_dev->name, DEV_NAME_LEN, RBD_DRV_NAME "%d", rbd_dev->id);
Alex Eldere124a822012-01-29 13:57:44 -06002278
Alex Elderd720bcb2012-02-02 08:13:30 -06002279 rbd_dev->rbd_client = rbd_get_client(mon_dev_name, options);
2280 if (IS_ERR(rbd_dev->rbd_client)) {
2281 rc = PTR_ERR(rbd_dev->rbd_client);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002282 goto err_put_id;
Alex Elderd720bcb2012-02-02 08:13:30 -06002283 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002284
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002285 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06002286 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002287 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2288 if (rc < 0)
2289 goto err_out_client;
2290 rbd_dev->poolid = rc;
2291
2292 /* register our block device */
Alex Elder27cc2592012-02-02 08:13:30 -06002293 rc = register_blkdev(0, rbd_dev->name);
2294 if (rc < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002295 goto err_out_client;
Alex Elder27cc2592012-02-02 08:13:30 -06002296 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002297
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002298 rc = rbd_bus_add_dev(rbd_dev);
2299 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002300 goto err_out_blkdev;
2301
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002302 /* set up and announce blkdev mapping */
2303 rc = rbd_init_disk(rbd_dev);
2304 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002305 goto err_out_bus;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002306
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002307 rc = rbd_init_watch_dev(rbd_dev);
2308 if (rc)
2309 goto err_out_bus;
2310
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002311 return count;
2312
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002313err_out_bus:
Alex Elder499afd52012-02-02 08:13:29 -06002314 rbd_id_put(rbd_dev);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002315
2316 /* this will also clean up rest of rbd_dev stuff */
2317
2318 rbd_bus_del_dev(rbd_dev);
2319 kfree(options);
2320 kfree(mon_dev_name);
2321 return rc;
2322
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002323err_out_blkdev:
2324 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2325err_out_client:
2326 rbd_put_client(rbd_dev);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002327err_put_id:
Alex Elder499afd52012-02-02 08:13:29 -06002328 rbd_id_put(rbd_dev);
Alex Elder27cc2592012-02-02 08:13:30 -06002329err_nomem:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002330 kfree(options);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002331 kfree(mon_dev_name);
Alex Elder27cc2592012-02-02 08:13:30 -06002332 kfree(rbd_dev);
2333
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002334 dout("Error adding device %s\n", buf);
2335 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06002336
2337 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002338}
2339
2340static struct rbd_device *__rbd_get_dev(unsigned long id)
2341{
2342 struct list_head *tmp;
2343 struct rbd_device *rbd_dev;
2344
Alex Eldere124a822012-01-29 13:57:44 -06002345 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002346 list_for_each(tmp, &rbd_dev_list) {
2347 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Eldere124a822012-01-29 13:57:44 -06002348 if (rbd_dev->id == id) {
2349 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002350 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06002351 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002352 }
Alex Eldere124a822012-01-29 13:57:44 -06002353 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002354 return NULL;
2355}
2356
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002357static void rbd_dev_release(struct device *dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002358{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002359 struct rbd_device *rbd_dev =
2360 container_of(dev, struct rbd_device, dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002361
Alex Elder1dbb4392012-01-24 10:08:37 -06002362 if (rbd_dev->watch_request) {
2363 struct ceph_client *client = rbd_dev->rbd_client->client;
2364
2365 ceph_osdc_unregister_linger_request(&client->osdc,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002366 rbd_dev->watch_request);
Alex Elder1dbb4392012-01-24 10:08:37 -06002367 }
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002368 if (rbd_dev->watch_event)
Yehuda Sadeh79e30572011-07-12 16:56:57 -07002369 rbd_req_sync_unwatch(rbd_dev, rbd_dev->obj_md_name);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002370
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002371 rbd_put_client(rbd_dev);
2372
2373 /* clean up and free blkdev */
2374 rbd_free_disk(rbd_dev);
2375 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2376 kfree(rbd_dev);
2377
2378 /* release module ref */
2379 module_put(THIS_MODULE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002380}
2381
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002382static ssize_t rbd_remove(struct bus_type *bus,
2383 const char *buf,
2384 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002385{
2386 struct rbd_device *rbd_dev = NULL;
2387 int target_id, rc;
2388 unsigned long ul;
2389 int ret = count;
2390
2391 rc = strict_strtoul(buf, 10, &ul);
2392 if (rc)
2393 return rc;
2394
2395 /* convert to int; abort if we lost anything in the conversion */
2396 target_id = (int) ul;
2397 if (target_id != ul)
2398 return -EINVAL;
2399
2400 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2401
2402 rbd_dev = __rbd_get_dev(target_id);
2403 if (!rbd_dev) {
2404 ret = -ENOENT;
2405 goto done;
2406 }
2407
Alex Elder499afd52012-02-02 08:13:29 -06002408 rbd_id_put(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002409
2410 __rbd_remove_all_snaps(rbd_dev);
2411 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002412
2413done:
2414 mutex_unlock(&ctl_mutex);
2415 return ret;
2416}
2417
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002418static ssize_t rbd_snap_add(struct device *dev,
2419 struct device_attribute *attr,
2420 const char *buf,
2421 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002422{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002423 struct rbd_device *rbd_dev = dev_to_rbd(dev);
2424 int ret;
2425 char *name = kmalloc(count + 1, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002426 if (!name)
2427 return -ENOMEM;
2428
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002429 snprintf(name, count, "%s", buf);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002430
2431 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2432
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002433 ret = rbd_header_add_snap(rbd_dev,
2434 name, GFP_KERNEL);
2435 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002436 goto err_unlock;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002437
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002438 ret = __rbd_update_snaps(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002439 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002440 goto err_unlock;
2441
2442 /* shouldn't hold ctl_mutex when notifying.. notify might
2443 trigger a watch callback that would need to get that mutex */
2444 mutex_unlock(&ctl_mutex);
2445
2446 /* make a best effort, don't error if failed */
2447 rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002448
2449 ret = count;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002450 kfree(name);
2451 return ret;
2452
2453err_unlock:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002454 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002455 kfree(name);
2456 return ret;
2457}
2458
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002459/*
2460 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002461 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002462 */
2463static int rbd_sysfs_init(void)
2464{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002465 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002466
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002467 ret = bus_register(&rbd_bus_type);
Alex Elder21079782012-01-24 10:08:36 -06002468 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002469 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002470
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002471 ret = device_register(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002472
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002473 return ret;
2474}
2475
2476static void rbd_sysfs_cleanup(void)
2477{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002478 device_unregister(&rbd_root_dev);
2479 bus_unregister(&rbd_bus_type);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002480}
2481
2482int __init rbd_init(void)
2483{
2484 int rc;
2485
2486 rc = rbd_sysfs_init();
2487 if (rc)
2488 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002489 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002490 return 0;
2491}
2492
2493void __exit rbd_exit(void)
2494{
2495 rbd_sysfs_cleanup();
2496}
2497
/* Hook rbd_init()/rbd_exit() up as the module's load and unload handlers. */
2498module_init(rbd_init);
2499module_exit(rbd_exit);
2500
/* Module metadata. */
2501MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
2502MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
2503MODULE_DESCRIPTION("rados block device");
2504
2505/* following authorship retained from original osdblk.c */
2506MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
2507
2508MODULE_LICENSE("GPL");