blob: c7681d46cf865bde70216f4e52a60a3ed4dc4a98 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elderaafb230e2012-09-06 16:00:54 -050044#define RBD_DEBUG /* Activate rbd_assert() calls */
45
Alex Elder593a9e72012-02-07 12:03:37 -060046/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
Alex Elderdf111be2012-08-09 10:33:26 -070055/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
Alex Elderf0f8cef2012-01-29 13:57:44 -060059#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070061
62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
63
Alex Elderd4b125e2012-07-03 16:01:19 -050064#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
65#define RBD_MAX_SNAP_NAME_LEN \
66 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
67
Alex Elder35d489f2012-07-03 16:01:19 -050068#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070069#define RBD_MAX_OPT_LEN 1024
70
71#define RBD_SNAP_HEAD_NAME "-"
72
Alex Elder1e130192012-07-03 16:01:19 -050073#define RBD_IMAGE_ID_LEN_MAX 64
74#define RBD_OBJ_PREFIX_LEN_MAX 64
Alex Elder589d30e2012-07-10 20:30:11 -050075
Alex Elderd8891402012-10-09 13:50:17 -070076/* Feature bits */
77
78#define RBD_FEATURE_LAYERING 1
79
80/* Features supported by this (client software) implementation. */
81
82#define RBD_FEATURES_ALL (0)
83
Alex Elder81a89792012-02-02 08:13:30 -060084/*
85 * An RBD device name will be "rbd#", where the "rbd" comes from
86 * RBD_DRV_NAME above, and # is a unique integer identifier.
87 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
88 * enough to hold all possible device names.
89 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070090#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060091#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070092
Alex Eldercc0538b2012-08-10 13:12:07 -070093#define RBD_READ_ONLY_DEFAULT false
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070094
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These four fields never change for a given rbd image */
	char *object_prefix;	/* prefix used to build data object names */
	u64 features;		/* RBD_FEATURE_* bit mask */
	__u8 obj_order;		/* log2 of the object (segment) size */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* image size in bytes */
	struct ceph_snap_context *snapc;	/* snapshot ids, newest first */
	char *snap_names;	/* snapshot names, same order as snapc */
	u64 *snap_sizes;	/* per-snapshot sizes, same order as snapc */

	u64 obj_version;	/* version of the header object */
};

/* Per-mapping options parsed from the "add" string */
struct rbd_options {
	bool read_only;		/* map the device read-only */
};
118
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client *client;	/* underlying ceph client handle */
	struct kref kref;		/* released via rbd_client_release() */
	struct list_head node;		/* entry in global rbd_client_list */
};
127
/*
 * a request completion status
 */
struct rbd_req_status {
	int done;	/* nonzero once the request has completed */
	int rc;		/* completion result code */
	u64 bytes;	/* number of bytes transferred */
};

/*
 * a collection of requests
 */
struct rbd_req_coll {
	int total;	/* number of requests in the collection */
	int num_done;	/* how many have completed so far */
	struct kref kref;	/* released via rbd_coll_release() */
	struct rbd_req_status status[0];	/* one slot per request */
};

/*
 * a single io request
 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;			/* request length in bytes */
	int coll_index;			/* slot in coll->status[] */
	struct rbd_req_coll *coll;	/* owning collection, if any */
};
158
/* In-memory record of one image snapshot, exposed via sysfs */
struct rbd_snap {
	struct device dev;	/* sysfs device for this snapshot */
	const char *name;	/* snapshot name */
	u64 size;		/* image size at snapshot time */
	struct list_head node;	/* entry in rbd_dev->snaps */
	u64 id;			/* snapshot id */
	u64 features;		/* RBD_FEATURE_* bits of the snapshot */
};

/* State describing what (image head or snapshot) is currently mapped */
struct rbd_mapping {
	char *snap_name;	/* mapped snapshot name, or "-" for head */
	u64 snap_id;		/* mapped snapshot id (CEPH_NOSNAP for head) */
	u64 size;		/* size of the mapped image/snapshot */
	u64 features;		/* features of the mapped image/snapshot */
	bool snap_exists;	/* true while mapped snapshot still exists */
	bool read_only;		/* mapping is read-only */
};
176
/*
 * a single device
 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;	/* blkdev's gendisk and rq */

	u32 image_format;	/* Either 1 or 2 */
	struct rbd_client *rbd_client;	/* (shared) ceph client */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;	/* queue lock */

	struct rbd_image_header header;
	char *image_id;		/* image id string */
	size_t image_id_len;
	char *image_name;	/* user-visible image name */
	size_t image_name_len;
	char *header_name;	/* name of the image header object */
	char *pool_name;	/* pool the image resides in */
	int pool_id;

	struct ceph_osd_event *watch_event;	/* header watch registration */
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore header_rwsem;

	struct rbd_mapping mapping;	/* currently mapped snapshot/head */

	struct list_head node;	/* entry in global rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
};
218
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700219static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a822012-01-29 13:57:44 -0600220
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700221static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a822012-01-29 13:57:44 -0600222static DEFINE_SPINLOCK(rbd_dev_list_lock);
223
Alex Elder432b8582012-01-29 13:57:44 -0600224static LIST_HEAD(rbd_client_list); /* clients */
225static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700226
Alex Elder304f6802012-08-31 17:29:52 -0500227static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
228static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
229
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800230static void rbd_dev_release(struct device *dev);
Alex Elder41f38c22012-10-25 23:34:40 -0500231static void rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800232
Alex Elderf0f8cef2012-01-29 13:57:44 -0600233static ssize_t rbd_add(struct bus_type *bus, const char *buf,
234 size_t count);
235static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
236 size_t count);
237
/* Write-only bus attributes: echo to /sys/bus/rbd/{add,remove} */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/* No-op release: rbd_root_dev is static and needs no cleanup */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Parent device of all rbd devices in sysfs */
static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};
257
#ifdef RBD_DEBUG
/*
 * Assert that an expression holds; on failure, report the failed
 * expression with its function and line, then BUG().  Compiles to a
 * no-op when RBD_DEBUG is not defined (see #else below).
 */
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800270
/* Take a reference on the rbd device's embedded struct device */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}

/* Drop a reference taken with rbd_get_dev() */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700280
Alex Elder117973f2012-08-31 17:29:55 -0500281static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
282static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700283
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700284static int rbd_open(struct block_device *bdev, fmode_t mode)
285{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600286 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700287
Alex Elderf84344f2012-08-31 17:29:51 -0500288 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700289 return -EROFS;
290
Alex Elder340c7a22012-08-10 13:12:07 -0700291 rbd_get_dev(rbd_dev);
Alex Elderf84344f2012-08-31 17:29:51 -0500292 set_device_ro(bdev, rbd_dev->mapping.read_only);
Alex Elder340c7a22012-08-10 13:12:07 -0700293
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700294 return 0;
295}
296
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800297static int rbd_release(struct gendisk *disk, fmode_t mode)
298{
299 struct rbd_device *rbd_dev = disk->private_data;
300
301 rbd_put_dev(rbd_dev);
302
303 return 0;
304}
305
/* Block-device operations; I/O itself is driven via the request queue */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
311
/*
 * Initialize an rbd client instance.
 * We own *ceph_opts.
 *
 * On success the new client has been added to rbd_client_list and
 * ownership of ceph_opts has passed to the ceph client.  On failure
 * ceph_opts is destroyed and an ERR_PTR is returned.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("rbd_client_create\n");
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	/*
	 * NOTE(review): if ceph_create_client() fails we return -ENOMEM
	 * rather than PTR_ERR(rbdc->client) — confirm this is intended.
	 */
	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
	if (IS_ERR(rbdc->client))
		goto out_mutex;
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_err;

	/* Publish the new client so rbd_client_find() can share it */
	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	mutex_unlock(&ctl_mutex);

	dout("rbd_client_create created %p\n", rbdc);
	return rbdc;

out_err:
	ceph_destroy_client(rbdc->client);	/* also releases ceph_opts */
out_mutex:
	mutex_unlock(&ctl_mutex);
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	return ERR_PTR(ret);
}
359
360/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700361 * Find a ceph client with specific addr and configuration. If
362 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700363 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700364static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700365{
366 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700367 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700368
Alex Elder43ae4702012-07-03 16:01:18 -0500369 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700370 return NULL;
371
Alex Elder1f7ba332012-08-10 13:12:07 -0700372 spin_lock(&rbd_client_list_lock);
373 list_for_each_entry(client_node, &rbd_client_list, node) {
374 if (!ceph_compare_options(ceph_opts, client_node->client)) {
375 kref_get(&client_node->kref);
376 found = true;
377 break;
378 }
379 }
380 spin_unlock(&rbd_client_list_lock);
381
382 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700383}
384
/*
 * mount options
 *
 * Tokens below Opt_last_int take an integer argument, tokens between
 * Opt_last_int and Opt_last_string take a string, and tokens between
 * Opt_last_string and Opt_last_bool are Boolean flags (see
 * parse_rbd_opts_token()).
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};
409
410static int parse_rbd_opts_token(char *c, void *private)
411{
Alex Elder43ae4702012-07-03 16:01:18 -0500412 struct rbd_options *rbd_opts = private;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700413 substring_t argstr[MAX_OPT_ARGS];
414 int token, intval, ret;
415
Alex Elder43ae4702012-07-03 16:01:18 -0500416 token = match_token(c, rbd_opts_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700417 if (token < 0)
418 return -EINVAL;
419
420 if (token < Opt_last_int) {
421 ret = match_int(&argstr[0], &intval);
422 if (ret < 0) {
423 pr_err("bad mount option arg (not int) "
424 "at '%s'\n", c);
425 return ret;
426 }
427 dout("got int token %d val %d\n", token, intval);
428 } else if (token > Opt_last_int && token < Opt_last_string) {
429 dout("got string token %d val %s\n", token,
430 argstr[0].from);
Alex Eldercc0538b2012-08-10 13:12:07 -0700431 } else if (token > Opt_last_string && token < Opt_last_bool) {
432 dout("got Boolean token %d\n", token);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700433 } else {
434 dout("got token %d\n", token);
435 }
436
437 switch (token) {
Alex Eldercc0538b2012-08-10 13:12:07 -0700438 case Opt_read_only:
439 rbd_opts->read_only = true;
440 break;
441 case Opt_read_write:
442 rbd_opts->read_only = false;
443 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700444 default:
Alex Elderaafb230e2012-09-06 16:00:54 -0500445 rbd_assert(false);
446 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700447 }
448 return 0;
449}
450
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.
 *
 * Parses the option string (recording rbd-specific options directly
 * into rbd_dev->mapping), then either shares an existing matching
 * client or creates a new one.  On success rbd_dev->rbd_client holds
 * a reference; returns 0 or a negative errno.
 */
static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
				size_t mon_addr_len, char *options)
{
	struct rbd_options rbd_opts;
	struct ceph_options *ceph_opts;
	struct rbd_client *rbdc;

	/* Initialize all rbd options to the defaults */

	rbd_opts.read_only = RBD_READ_ONLY_DEFAULT;

	ceph_opts = ceph_parse_options(options, mon_addr,
					mon_addr + mon_addr_len,
					parse_rbd_opts_token, &rbd_opts);
	if (IS_ERR(ceph_opts))
		return PTR_ERR(ceph_opts);

	/* Record the parsed rbd options */

	rbd_dev->mapping.read_only = rbd_opts.read_only;

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		/* using an existing client; rbd_client_find() took a ref */
		ceph_destroy_options(ceph_opts);
	} else {
		/* rbd_client_create() takes ownership of ceph_opts */
		rbdc = rbd_client_create(ceph_opts);
		if (IS_ERR(rbdc))
			return PTR_ERR(rbdc);
	}
	rbd_dev->rbd_client = rbdc;

	return 0;
}
489
490/*
491 * Destroy ceph client
Alex Elderd23a4b32012-01-29 13:57:43 -0600492 *
Alex Elder432b8582012-01-29 13:57:44 -0600493 * Caller must hold rbd_client_list_lock.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700494 */
495static void rbd_client_release(struct kref *kref)
496{
497 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
498
499 dout("rbd_release_client %p\n", rbdc);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500500 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700501 list_del(&rbdc->node);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500502 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700503
504 ceph_destroy_client(rbdc->client);
505 kfree(rbdc);
506}
507
508/*
509 * Drop reference to ceph client node. If it's not referenced anymore, release
510 * it.
511 */
512static void rbd_put_client(struct rbd_device *rbd_dev)
513{
514 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
515 rbd_dev->rbd_client = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700516}
517
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700518/*
519 * Destroy requests collection
520 */
521static void rbd_coll_release(struct kref *kref)
522{
523 struct rbd_req_coll *coll =
524 container_of(kref, struct rbd_req_coll, kref);
525
526 dout("rbd_coll_release %p\n", coll);
527 kfree(coll);
528}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700529
Alex Eldera30b71b2012-07-10 20:30:11 -0500530static bool rbd_image_format_valid(u32 image_format)
531{
532 return image_format == 1 || image_format == 2;
533}
534
/*
 * Sanity-check a format 1 on-disk image header before it is
 * translated by rbd_header_from_disk().  Returns false if the header
 * is malformed or describes an image this code cannot represent.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire the snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
573
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700574/*
575 * Create a new header structure, translate header format from the on-disk
576 * header.
577 */
578static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500579 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700580{
Alex Elderccece232012-07-10 20:30:10 -0500581 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500582 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500583 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500584 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700585
Alex Elder6a523252012-07-19 17:12:59 -0500586 memset(header, 0, sizeof (*header));
587
Alex Elder103a1502012-08-02 11:29:45 -0500588 snap_count = le32_to_cpu(ondisk->snap_count);
589
Alex Elder58c17b02012-08-23 23:22:06 -0500590 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
591 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500592 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700593 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500594 memcpy(header->object_prefix, ondisk->object_prefix, len);
595 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600596
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700597 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500598 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
599
Alex Elder621901d2012-08-23 23:22:06 -0500600 /* Save a copy of the snapshot names */
601
Alex Elderf785cc12012-08-23 23:22:06 -0500602 if (snap_names_len > (u64) SIZE_MAX)
603 return -EIO;
604 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700605 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500606 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500607 /*
608 * Note that rbd_dev_v1_header_read() guarantees
609 * the ondisk buffer we're working with has
610 * snap_names_len bytes beyond the end of the
611 * snapshot id array, this memcpy() is safe.
612 */
613 memcpy(header->snap_names, &ondisk->snaps[snap_count],
614 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500615
Alex Elder621901d2012-08-23 23:22:06 -0500616 /* Record each snapshot's size */
617
Alex Elderd2bb24e2012-07-26 23:37:14 -0500618 size = snap_count * sizeof (*header->snap_sizes);
619 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700620 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500621 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500622 for (i = 0; i < snap_count; i++)
623 header->snap_sizes[i] =
624 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700625 } else {
Alex Elderccece232012-07-10 20:30:10 -0500626 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700627 header->snap_names = NULL;
628 header->snap_sizes = NULL;
629 }
Alex Elder849b4262012-07-09 21:04:24 -0500630
Alex Elder34b13182012-07-13 20:35:12 -0500631 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700632 header->obj_order = ondisk->options.order;
633 header->crypt_type = ondisk->options.crypt_type;
634 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500635
Alex Elder621901d2012-08-23 23:22:06 -0500636 /* Allocate and fill in the snapshot context */
637
Alex Elderf84344f2012-08-31 17:29:51 -0500638 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500639 size = sizeof (struct ceph_snap_context);
640 size += snap_count * sizeof (header->snapc->snaps[0]);
641 header->snapc = kzalloc(size, GFP_KERNEL);
642 if (!header->snapc)
643 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700644
645 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500646 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700647 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500648 for (i = 0; i < snap_count; i++)
649 header->snapc->snaps[i] =
650 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700651
652 return 0;
653
Alex Elder6a523252012-07-19 17:12:59 -0500654out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500655 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500656 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700657 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500658 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500659 kfree(header->object_prefix);
660 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500661
Alex Elder00f1f362012-02-07 12:03:36 -0600662 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700663}
664
Alex Elder8836b992012-08-30 14:42:15 -0500665static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700666{
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700667
Alex Eldere86924a2012-07-10 20:30:11 -0500668 struct rbd_snap *snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600669
Alex Eldere86924a2012-07-10 20:30:11 -0500670 list_for_each_entry(snap, &rbd_dev->snaps, node) {
671 if (!strcmp(snap_name, snap->name)) {
672 rbd_dev->mapping.snap_id = snap->id;
673 rbd_dev->mapping.size = snap->size;
Alex Elder34b13182012-07-13 20:35:12 -0500674 rbd_dev->mapping.features = snap->features;
Alex Elder00f1f362012-02-07 12:03:36 -0600675
Alex Eldere86924a2012-07-10 20:30:11 -0500676 return 0;
Alex Elder00f1f362012-02-07 12:03:36 -0600677 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700678 }
Alex Eldere86924a2012-07-10 20:30:11 -0500679
Alex Elder00f1f362012-02-07 12:03:36 -0600680 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700681}
682
/*
 * Record what the device maps: either the image head (snap_name is
 * RBD_SNAP_HEAD_NAME) or a named snapshot.  Snapshot mappings are
 * forced read-only.  Returns 0 on success or -ENOENT if the named
 * snapshot does not exist.
 */
static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
{
	int ret;

	if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		/* Mapping the image head, not a snapshot */
		rbd_dev->mapping.snap_id = CEPH_NOSNAP;
		rbd_dev->mapping.size = rbd_dev->header.image_size;
		rbd_dev->mapping.features = rbd_dev->header.features;
		rbd_dev->mapping.snap_exists = false;
		ret = 0;
	} else {
		/* snap_by_name() fills in snap_id, size, and features */
		ret = snap_by_name(rbd_dev, snap_name);
		if (ret < 0)
			goto done;
		rbd_dev->mapping.snap_exists = true;
		rbd_dev->mapping.read_only = true;
	}
	rbd_dev->mapping.snap_name = snap_name;
done:
	return ret;
}
705
706static void rbd_header_free(struct rbd_image_header *header)
707{
Alex Elder849b4262012-07-09 21:04:24 -0500708 kfree(header->object_prefix);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500709 header->object_prefix = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700710 kfree(header->snap_sizes);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500711 header->snap_sizes = NULL;
Alex Elder849b4262012-07-09 21:04:24 -0500712 kfree(header->snap_names);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500713 header->snap_names = NULL;
Josh Durgind1d25642011-12-05 14:03:05 -0800714 ceph_put_snap_context(header->snapc);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500715 header->snapc = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700716}
717
Alex Elder65ccfe22012-08-09 10:33:26 -0700718static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700719{
Alex Elder65ccfe22012-08-09 10:33:26 -0700720 char *name;
721 u64 segment;
722 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700723
Alex Elder65ccfe22012-08-09 10:33:26 -0700724 name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
725 if (!name)
726 return NULL;
727 segment = offset >> rbd_dev->header.obj_order;
728 ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
729 rbd_dev->header.object_prefix, segment);
730 if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
731 pr_err("error formatting segment name for #%llu (%d)\n",
732 segment, ret);
733 kfree(name);
734 name = NULL;
735 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700736
Alex Elder65ccfe22012-08-09 10:33:26 -0700737 return name;
738}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700739
Alex Elder65ccfe22012-08-09 10:33:26 -0700740static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
741{
742 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700743
Alex Elder65ccfe22012-08-09 10:33:26 -0700744 return offset & (segment_size - 1);
745}
746
747static u64 rbd_segment_length(struct rbd_device *rbd_dev,
748 u64 offset, u64 length)
749{
750 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
751
752 offset &= segment_size - 1;
753
Alex Elderaafb230e2012-09-06 16:00:54 -0500754 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700755 if (offset + length > segment_size)
756 length = segment_size - offset;
757
758 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700759}
760
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700761static int rbd_get_num_segments(struct rbd_image_header *header,
762 u64 ofs, u64 len)
763{
Alex Elderdf111be2012-08-09 10:33:26 -0700764 u64 start_seg;
765 u64 end_seg;
766
767 if (!len)
768 return 0;
769 if (len - 1 > U64_MAX - ofs)
770 return -ERANGE;
771
772 start_seg = ofs >> header->obj_order;
773 end_seg = (ofs + len - 1) >> header->obj_order;
774
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700775 return end_seg - start_seg + 1;
776}
777
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700778/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700779 * returns the size of an object in the image
780 */
781static u64 rbd_obj_bytes(struct rbd_image_header *header)
782{
783 return 1 << header->obj_order;
784}
785
786/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700787 * bio helpers
788 */
789
790static void bio_chain_put(struct bio *chain)
791{
792 struct bio *tmp;
793
794 while (chain) {
795 tmp = chain;
796 chain = chain->bi_next;
797 bio_put(tmp);
798 }
799}
800
801/*
802 * zeros a bio chain, starting at specific offset
803 */
804static void zero_bio_chain(struct bio *chain, int start_ofs)
805{
806 struct bio_vec *bv;
807 unsigned long flags;
808 void *buf;
809 int i;
810 int pos = 0;
811
812 while (chain) {
813 bio_for_each_segment(bv, chain, i) {
814 if (pos + bv->bv_len > start_ofs) {
815 int remainder = max(start_ofs - pos, 0);
816 buf = bvec_kmap_irq(bv, &flags);
817 memset(buf + remainder, 0,
818 bv->bv_len - remainder);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +0200819 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700820 }
821 pos += bv->bv_len;
822 }
823
824 chain = chain->bi_next;
825 }
826}
827
/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 *
 * Returns the new bio, or NULL on allocation failure or if the
 * requested range is empty or extends beyond the source bio.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;
	unsigned short end_idx;
	unsigned short vcnt;
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	/* voff is the byte offset of the range within segment idx */
	voff = resid;

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	/* After the loop, resid is the used byte count of segment end_idx */
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	/* offset is relative to the source bio, hence the addition */
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		/* Single segment: it holds the entire cloned range */
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700908
Alex Elderf7760da2012-10-20 22:17:27 -0500909/*
910 * Clone a portion of a bio chain, starting at the given byte offset
911 * into the first bio in the source chain and continuing for the
912 * number of bytes indicated. The result is another bio chain of
913 * exactly the given length, or a null pointer on error.
914 *
915 * The bio_src and offset parameters are both in-out. On entry they
916 * refer to the first source bio and the offset into that bio where
917 * the start of data to be cloned is located.
918 *
919 * On return, bio_src is updated to refer to the bio in the source
920 * chain that contains first un-cloned byte, and *offset will
921 * contain the offset of that byte within that bio.
922 */
923static struct bio *bio_chain_clone_range(struct bio **bio_src,
924 unsigned int *offset,
925 unsigned int len,
926 gfp_t gfpmask)
927{
928 struct bio *bi = *bio_src;
929 unsigned int off = *offset;
930 struct bio *chain = NULL;
931 struct bio **end;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700932
Alex Elderf7760da2012-10-20 22:17:27 -0500933 /* Build up a chain of clone bios up to the limit */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700934
Alex Elderf7760da2012-10-20 22:17:27 -0500935 if (!bi || off >= bi->bi_size || !len)
936 return NULL; /* Nothing to clone */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700937
Alex Elderf7760da2012-10-20 22:17:27 -0500938 end = &chain;
939 while (len) {
940 unsigned int bi_size;
941 struct bio *bio;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700942
Alex Elderf7760da2012-10-20 22:17:27 -0500943 if (!bi)
944 goto out_err; /* EINVAL; ran out of bio's */
945 bi_size = min_t(unsigned int, bi->bi_size - off, len);
946 bio = bio_clone_range(bi, off, bi_size, gfpmask);
947 if (!bio)
948 goto out_err; /* ENOMEM */
949
950 *end = bio;
951 end = &bio->bi_next;
952
953 off += bi_size;
954 if (off == bi->bi_size) {
955 bi = bi->bi_next;
956 off = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700957 }
Alex Elderf7760da2012-10-20 22:17:27 -0500958 len -= bi_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700959 }
Alex Elderf7760da2012-10-20 22:17:27 -0500960 *bio_src = bi;
961 *offset = off;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700962
Alex Elderf7760da2012-10-20 22:17:27 -0500963 return chain;
964out_err:
965 bio_chain_put(chain);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700966
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700967 return NULL;
968}
969
970/*
971 * helpers for osd request op vectors.
972 */
Alex Elder57cfc102012-06-26 12:57:03 -0700973static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
974 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700975{
Alex Elder57cfc102012-06-26 12:57:03 -0700976 struct ceph_osd_req_op *ops;
977
978 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
979 if (!ops)
980 return NULL;
981
982 ops[0].op = opcode;
983
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700984 /*
985 * op extent offset and length will be set later on
986 * in calc_raw_layout()
987 */
Alex Elder57cfc102012-06-26 12:57:03 -0700988 ops[0].payload_len = payload_len;
989
990 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700991}
992
/* Free an op vector allocated by rbd_create_rw_ops() (NULL is a no-op). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
997
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700998static void rbd_coll_end_req_index(struct request *rq,
999 struct rbd_req_coll *coll,
1000 int index,
1001 int ret, u64 len)
1002{
1003 struct request_queue *q;
1004 int min, max, i;
1005
Alex Elderbd919d42012-07-13 20:35:11 -05001006 dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
1007 coll, index, ret, (unsigned long long) len);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001008
1009 if (!rq)
1010 return;
1011
1012 if (!coll) {
1013 blk_end_request(rq, ret, len);
1014 return;
1015 }
1016
1017 q = rq->q;
1018
1019 spin_lock_irq(q->queue_lock);
1020 coll->status[index].done = 1;
1021 coll->status[index].rc = ret;
1022 coll->status[index].bytes = len;
1023 max = min = coll->num_done;
1024 while (max < coll->total && coll->status[max].done)
1025 max++;
1026
1027 for (i = min; i<max; i++) {
1028 __blk_end_request(rq, coll->status[i].rc,
1029 coll->status[i].bytes);
1030 coll->num_done++;
1031 kref_put(&coll->kref, rbd_coll_release);
1032 }
1033 spin_unlock_irq(q->queue_lock);
1034}
1035
/*
 * Complete the sub-request described by "req" using the request,
 * collection and index recorded in it.
 */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
1041
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001042/*
1043 * Send ceph osd request
1044 */
1045static int rbd_do_request(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -05001046 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001047 struct ceph_snap_context *snapc,
1048 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001049 const char *object_name, u64 ofs, u64 len,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001050 struct bio *bio,
1051 struct page **pages,
1052 int num_pages,
1053 int flags,
1054 struct ceph_osd_req_op *ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001055 struct rbd_req_coll *coll,
1056 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001057 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001058 struct ceph_msg *msg),
1059 struct ceph_osd_request **linger_req,
1060 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001061{
1062 struct ceph_osd_request *req;
1063 struct ceph_file_layout *layout;
1064 int ret;
1065 u64 bno;
1066 struct timespec mtime = CURRENT_TIME;
1067 struct rbd_request *req_data;
1068 struct ceph_osd_request_head *reqhead;
Alex Elder1dbb4392012-01-24 10:08:37 -06001069 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001070
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001071 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001072 if (!req_data) {
1073 if (coll)
1074 rbd_coll_end_req_index(rq, coll, coll_index,
1075 -ENOMEM, len);
1076 return -ENOMEM;
1077 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001078
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001079 if (coll) {
1080 req_data->coll = coll;
1081 req_data->coll_index = coll_index;
1082 }
1083
Alex Elderf7760da2012-10-20 22:17:27 -05001084 dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
1085 object_name, (unsigned long long) ofs,
1086 (unsigned long long) len, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001087
Alex Elder0ce1a792012-07-03 16:01:18 -05001088 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder1dbb4392012-01-24 10:08:37 -06001089 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
1090 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -07001091 if (!req) {
Sage Weil4ad12622011-05-03 09:23:36 -07001092 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001093 goto done_pages;
1094 }
1095
1096 req->r_callback = rbd_cb;
1097
1098 req_data->rq = rq;
1099 req_data->bio = bio;
1100 req_data->pages = pages;
1101 req_data->len = len;
1102
1103 req->r_priv = req_data;
1104
1105 reqhead = req->r_request->front.iov_base;
1106 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
1107
Alex Elderaded07e2012-07-03 16:01:18 -05001108 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001109 req->r_oid_len = strlen(req->r_oid);
1110
1111 layout = &req->r_file_layout;
1112 memset(layout, 0, sizeof(*layout));
1113 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1114 layout->fl_stripe_count = cpu_to_le32(1);
1115 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
Alex Elder0ce1a792012-07-03 16:01:18 -05001116 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
Sage Weil6cae3712012-09-24 21:02:47 -07001117 ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
1118 req, ops);
1119 rbd_assert(ret == 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001120
1121 ceph_osdc_build_request(req, ofs, &len,
1122 ops,
1123 snapc,
1124 &mtime,
1125 req->r_oid, req->r_oid_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001126
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001127 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -06001128 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001129 *linger_req = req;
1130 }
1131
Alex Elder1dbb4392012-01-24 10:08:37 -06001132 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001133 if (ret < 0)
1134 goto done_err;
1135
1136 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -06001137 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001138 if (ver)
1139 *ver = le64_to_cpu(req->r_reassert_version.version);
Alex Elderbd919d42012-07-13 20:35:11 -05001140 dout("reassert_ver=%llu\n",
1141 (unsigned long long)
1142 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001143 ceph_osdc_put_request(req);
1144 }
1145 return ret;
1146
1147done_err:
1148 bio_chain_put(req_data->bio);
1149 ceph_osdc_put_request(req);
1150done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001151 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001152 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001153 return ret;
1154}
1155
/*
 * Ceph osd op callback
 *
 * Completion handler for asynchronous rbd requests: decodes the
 * reply, zero-fills short or missing reads, completes the
 * sub-request, and releases the osd request and its bio chain.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	/* The op array immediately follows the reply head */
	op = (void *)(replyhead + 1);
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
		(unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* Reading a nonexistent object: treat as all zeroes */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* Short read: zero the tail and report the full length */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1195
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001196static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1197{
1198 ceph_osdc_put_request(req);
1199}
1200
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001201/*
1202 * Do a synchronous ceph osd operation
1203 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001204static int rbd_req_sync_op(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001205 struct ceph_snap_context *snapc,
1206 u64 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001207 int flags,
Alex Elder913d2fd2012-06-26 12:57:03 -07001208 struct ceph_osd_req_op *ops,
Alex Elderaded07e2012-07-03 16:01:18 -05001209 const char *object_name,
Alex Elderf8d4de62012-07-03 16:01:19 -05001210 u64 ofs, u64 inbound_size,
1211 char *inbound,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001212 struct ceph_osd_request **linger_req,
1213 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001214{
1215 int ret;
1216 struct page **pages;
1217 int num_pages;
Alex Elder913d2fd2012-06-26 12:57:03 -07001218
Alex Elderaafb230e2012-09-06 16:00:54 -05001219 rbd_assert(ops != NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001220
Alex Elderf8d4de62012-07-03 16:01:19 -05001221 num_pages = calc_pages_for(ofs, inbound_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001222 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
Dan Carpenterb8d06382010-10-11 21:14:23 +02001223 if (IS_ERR(pages))
1224 return PTR_ERR(pages);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001225
Alex Elder0ce1a792012-07-03 16:01:18 -05001226 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
Alex Elderf8d4de62012-07-03 16:01:19 -05001227 object_name, ofs, inbound_size, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001228 pages, num_pages,
1229 flags,
1230 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001231 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001232 NULL,
1233 linger_req, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001234 if (ret < 0)
Alex Elder913d2fd2012-06-26 12:57:03 -07001235 goto done;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001236
Alex Elderf8d4de62012-07-03 16:01:19 -05001237 if ((flags & CEPH_OSD_FLAG_READ) && inbound)
1238 ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001239
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001240done:
1241 ceph_release_page_vector(pages, num_pages);
1242 return ret;
1243}
1244
/*
 * Do an asynchronous ceph osd operation
 *
 * Issue the single-segment read or write described by "rq" (the
 * direction comes from the block request) against the backing object
 * for image offset "ofs".  The range [ofs, ofs + len) must not cross
 * a segment boundary; the caller has already split the request.
 * Completion is reported through coll/coll_index via rbd_req_cb().
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;
	int opcode;
	int flags;
	u64 snapid;

	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	if (rq_data_dir(rq) == WRITE) {
		opcode = CEPH_OSD_OP_WRITE;
		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
		snapid = CEPH_NOSNAP;
		payload_len = seg_len;
	} else {
		opcode = CEPH_OSD_OP_READ;
		flags = CEPH_OSD_FLAG_READ;
		/* Reads name the snapshot directly; no snap context */
		snapc = NULL;
		snapid = rbd_dev->mapping.snap_id;
		payload_len = 0;
	}

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1309
/*
 * Request sync osd read
 *
 * Synchronously read [ofs, ofs + len) of "object_name" at snapshot
 * "snapid" into "buf", optionally returning the object version via
 * "ver".  Returns bytes copied or negative errno.
 */
static int rbd_req_sync_read(struct rbd_device *rbd_dev,
			  u64 snapid,
			  const char *object_name,
			  u64 ofs, u64 len,
			  char *buf,
			  u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
	if (!ops)
		return -ENOMEM;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			       snapid,
			       CEPH_OSD_FLAG_READ,
			       ops, object_name, ofs, len, buf, NULL, ver);
	rbd_destroy_ops(ops);

	return ret;
}
1335
/*
 * Acknowledge a watch notification on the image header object.
 * Sent asynchronously (completion just drops the request); "ver" is
 * the header version the notification was handled against and
 * "notify_id" identifies the notification being acked.
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = cpu_to_le64(ver);
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  rbd_dev->header_name, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  ops,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1365
1366static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1367{
Alex Elder0ce1a792012-07-03 16:01:18 -05001368 struct rbd_device *rbd_dev = (struct rbd_device *)data;
Josh Durgina71b8912011-12-05 18:10:44 -08001369 u64 hver;
Sage Weil13143d22011-05-12 16:08:30 -07001370 int rc;
1371
Alex Elder0ce1a792012-07-03 16:01:18 -05001372 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001373 return;
1374
Alex Elderbd919d42012-07-13 20:35:11 -05001375 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1376 rbd_dev->header_name, (unsigned long long) notify_id,
1377 (unsigned int) opcode);
Alex Elder117973f2012-08-31 17:29:55 -05001378 rc = rbd_dev_refresh(rbd_dev, &hver);
Sage Weil13143d22011-05-12 16:08:30 -07001379 if (rc)
Alex Elderf0f8cef2012-01-29 13:57:44 -06001380 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
Alex Elder0ce1a792012-07-03 16:01:18 -05001381 " update snaps: %d\n", rbd_dev->major, rc);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001382
Alex Elder7f0a24d2012-07-25 09:32:40 -05001383 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001384}
1385
/*
 * Request sync osd watch
 *
 * Register a watch on the image header object: create the osd event
 * (which delivers notifications to rbd_watch_cb()), then send a
 * lingering WATCH request.  On success rbd_dev->watch_event and
 * rbd_dev->watch_request are set; on failure both are cleaned up.
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	/* flag = 1 means register (vs. unregister) the watch */
	ops[0].watch.flag = 1;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1429
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001430/*
1431 * Request sync osd unwatch
1432 */
Alex Elder070c6332012-07-25 09:32:41 -05001433static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001434{
1435 struct ceph_osd_req_op *ops;
Alex Elder57cfc102012-06-26 12:57:03 -07001436 int ret;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001437
Alex Elder57cfc102012-06-26 12:57:03 -07001438 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1439 if (!ops)
1440 return -ENOMEM;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001441
1442 ops[0].watch.ver = 0;
Alex Elder0ce1a792012-07-03 16:01:18 -05001443 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001444 ops[0].watch.flag = 0;
1445
Alex Elder0ce1a792012-07-03 16:01:18 -05001446 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001447 CEPH_NOSNAP,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001448 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1449 ops,
Alex Elder070c6332012-07-25 09:32:41 -05001450 rbd_dev->header_name,
1451 0, 0, NULL, NULL, NULL);
1452
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001453
1454 rbd_destroy_ops(ops);
Alex Elder0ce1a792012-07-03 16:01:18 -05001455 ceph_osdc_cancel_event(rbd_dev->watch_event);
1456 rbd_dev->watch_event = NULL;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001457 return ret;
1458}
1459
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001460/*
Alex Elder3cb4a682012-06-26 12:57:03 -07001461 * Synchronous osd object method call
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001462 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001463static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001464 const char *object_name,
1465 const char *class_name,
1466 const char *method_name,
Alex Elder3cb4a682012-06-26 12:57:03 -07001467 const char *outbound,
1468 size_t outbound_size,
Alex Elderf8d4de62012-07-03 16:01:19 -05001469 char *inbound,
1470 size_t inbound_size,
Alex Elder3cb4a682012-06-26 12:57:03 -07001471 int flags,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001472 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001473{
1474 struct ceph_osd_req_op *ops;
Alex Elderaded07e2012-07-03 16:01:18 -05001475 int class_name_len = strlen(class_name);
1476 int method_name_len = strlen(method_name);
Alex Elder3cb4a682012-06-26 12:57:03 -07001477 int payload_size;
Alex Elder57cfc102012-06-26 12:57:03 -07001478 int ret;
1479
Alex Elder3cb4a682012-06-26 12:57:03 -07001480 /*
1481 * Any input parameters required by the method we're calling
1482 * will be sent along with the class and method names as
1483 * part of the message payload. That data and its size are
1484 * supplied via the indata and indata_len fields (named from
1485 * the perspective of the server side) in the OSD request
1486 * operation.
1487 */
1488 payload_size = class_name_len + method_name_len + outbound_size;
1489 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
Alex Elder57cfc102012-06-26 12:57:03 -07001490 if (!ops)
1491 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001492
Alex Elderaded07e2012-07-03 16:01:18 -05001493 ops[0].cls.class_name = class_name;
1494 ops[0].cls.class_len = (__u8) class_name_len;
1495 ops[0].cls.method_name = method_name;
1496 ops[0].cls.method_len = (__u8) method_name_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001497 ops[0].cls.argc = 0;
Alex Elder3cb4a682012-06-26 12:57:03 -07001498 ops[0].cls.indata = outbound;
1499 ops[0].cls.indata_len = outbound_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001500
Alex Elder0ce1a792012-07-03 16:01:18 -05001501 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001502 CEPH_NOSNAP,
Alex Elder3cb4a682012-06-26 12:57:03 -07001503 flags, ops,
Alex Elderf8d4de62012-07-03 16:01:19 -05001504 object_name, 0, inbound_size, inbound,
1505 NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001506
1507 rbd_destroy_ops(ops);
1508
1509 dout("cls_exec returned %d\n", ret);
1510 return ret;
1511}
1512
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001513static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1514{
1515 struct rbd_req_coll *coll =
1516 kzalloc(sizeof(struct rbd_req_coll) +
1517 sizeof(struct rbd_req_status) * num_reqs,
1518 GFP_ATOMIC);
1519
1520 if (!coll)
1521 return NULL;
1522 coll->total = num_reqs;
1523 kref_init(&coll->kref);
1524 return coll;
1525}
1526
/*
 * block device queue callback
 *
 * Drains the request queue, splitting each filesystem request into
 * per-object segments and submitting each segment as a cloned bio
 * chain via an osd request.  The block layer calls this with
 * q->queue_lock held; the lock is dropped while segments are being
 * built/submitted and re-taken before ending the request or fetching
 * the next one.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;

	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		bool do_write;
		unsigned int size;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;
		unsigned int bio_offset;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);
		if (do_write && rbd_dev->mapping.read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		/* Dropped for the duration of segment submission */
		spin_unlock_irq(q->queue_lock);

		/* header_rwsem guards the mapping and snap context */
		down_read(&rbd_dev->header_rwsem);

		/* A mapped snapshot may have been deleted underneath us */
		if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
				!rbd_dev->mapping.snap_exists) {
			up_read(&rbd_dev->header_rwsem);
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		/*
		 * Take a reference on the snap context so writes in
		 * flight keep using a consistent one even if the
		 * header is refreshed meanwhile.  Dropped at the
		 * bottom of the loop.
		 */
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		bio = rq->bio;

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		if (num_segs <= 0) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, num_segs);
			ceph_put_snap_context(snapc);
			continue;
		}
		/* One completion-status slot per segment */
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		bio_offset = 0;
		do {
			u64 limit = rbd_segment_length(rbd_dev, ofs, size);
			unsigned int chain_size;
			struct bio *bio_chain;

			BUG_ON(limit > (u64) UINT_MAX);
			chain_size = (unsigned int) limit;
			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);

			/* Each in-flight segment holds a coll reference */
			kref_get(&coll->kref);

			/* Pass a cloned bio chain via an osd request */

			bio_chain = bio_chain_clone_range(&bio,
						&bio_offset, chain_size,
						GFP_ATOMIC);
			if (bio_chain)
				(void) rbd_do_op(rq, rbd_dev, snapc,
						ofs, chain_size,
						bio_chain, coll, cur_seg);
			else
				/* Record the failure for this segment only */
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, chain_size);
			size -= chain_size;
			ofs += chain_size;

			cur_seg++;
		} while (size > 0);
		/* Drop the submission-path reference taken at allocation */
		kref_put(&coll->kref, rbd_coll_release);

		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1636
1637/*
1638 * a queue callback. Makes sure that we don't create a bio that spans across
1639 * multiple osd objects. One exception would be with a single page bios,
Alex Elderf7760da2012-10-20 22:17:27 -05001640 * which we handle later at bio_chain_clone_range()
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001641 */
1642static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1643 struct bio_vec *bvec)
1644{
1645 struct rbd_device *rbd_dev = q->queuedata;
Alex Eldere5cfeed22012-10-20 22:17:27 -05001646 sector_t sector_offset;
1647 sector_t sectors_per_obj;
1648 sector_t obj_sector_offset;
1649 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001650
Alex Eldere5cfeed22012-10-20 22:17:27 -05001651 /*
1652 * Find how far into its rbd object the partition-relative
1653 * bio start sector is to offset relative to the enclosing
1654 * device.
1655 */
1656 sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
1657 sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1658 obj_sector_offset = sector_offset & (sectors_per_obj - 1);
Alex Elder593a9e72012-02-07 12:03:37 -06001659
Alex Eldere5cfeed22012-10-20 22:17:27 -05001660 /*
1661 * Compute the number of bytes from that offset to the end
1662 * of the object. Account for what's already used by the bio.
1663 */
1664 ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
1665 if (ret > bmd->bi_size)
1666 ret -= bmd->bi_size;
1667 else
1668 ret = 0;
1669
1670 /*
1671 * Don't send back more than was asked for. And if the bio
1672 * was empty, let the whole thing through because: "Note
1673 * that a block device *must* allow a single page to be
1674 * added to an empty bio."
1675 */
1676 rbd_assert(bvec->bv_len <= PAGE_SIZE);
1677 if (ret > (int) bvec->bv_len || !bmd->bi_size)
1678 ret = (int) bvec->bv_len;
1679
1680 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001681}
1682
1683static void rbd_free_disk(struct rbd_device *rbd_dev)
1684{
1685 struct gendisk *disk = rbd_dev->disk;
1686
1687 if (!disk)
1688 return;
1689
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001690 if (disk->flags & GENHD_FL_UP)
1691 del_gendisk(disk);
1692 if (disk->queue)
1693 blk_cleanup_queue(disk->queue);
1694 put_disk(disk);
1695}
1696
/*
 * Read the complete header for the given rbd device.
 *
 * Returns a pointer to a dynamically-allocated buffer containing
 * the complete and validated header. Caller can pass the address
 * of a variable that will be filled in with the version of the
 * header object at the time it was read.
 *
 * Returns a pointer-coded errno if a failure occurs.
 */
static struct rbd_image_header_ondisk *
rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		/* No-op on the first pass (ondisk is NULL) */
		kfree(ondisk);

		/* Fixed header + snapshot id array + name block */
		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return ERR_PTR(-ENOMEM);

		ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
				       rbd_dev->header_name,
				       0, size,
				       (char *) ondisk, version);

		if (ret < 0)
			goto out_err;
		/* A short read means the object shrank underneath us */
		if (WARN_ON((size_t) ret < size)) {
			ret = -ENXIO;
			pr_warning("short header read for image %s"
					" (want %zd got %d)\n",
				rbd_dev->image_name, size, ret);
			goto out_err;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			pr_warning("invalid header for image %s\n",
				rbd_dev->image_name);
			goto out_err;
		}

		/*
		 * Re-read if the snapshot count changed since the
		 * sizes we allocated for were computed.
		 */
		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	return ondisk;

out_err:
	kfree(ondisk);

	return ERR_PTR(ret);
}
1768
1769/*
1770 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001771 */
1772static int rbd_read_header(struct rbd_device *rbd_dev,
1773 struct rbd_image_header *header)
1774{
Alex Elder4156d992012-08-02 11:29:46 -05001775 struct rbd_image_header_ondisk *ondisk;
1776 u64 ver = 0;
1777 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001778
Alex Elder4156d992012-08-02 11:29:46 -05001779 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1780 if (IS_ERR(ondisk))
1781 return PTR_ERR(ondisk);
1782 ret = rbd_header_from_disk(header, ondisk);
1783 if (ret >= 0)
1784 header->obj_version = ver;
1785 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001786
Alex Elder4156d992012-08-02 11:29:46 -05001787 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001788}
1789
Alex Elder41f38c22012-10-25 23:34:40 -05001790static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001791{
1792 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001793 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001794
Alex Eldera0593292012-07-19 09:09:27 -05001795 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder41f38c22012-10-25 23:34:40 -05001796 rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001797}
1798
Alex Elder94785542012-10-09 13:50:17 -07001799static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1800{
1801 sector_t size;
1802
1803 if (rbd_dev->mapping.snap_id != CEPH_NOSNAP)
1804 return;
1805
1806 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1807 dout("setting size to %llu sectors", (unsigned long long) size);
1808 rbd_dev->mapping.size = (u64) size;
1809 set_capacity(rbd_dev->disk, size);
1810}
1811
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001812/*
1813 * only read the first part of the ondisk header, without the snaps info
1814 */
Alex Elder117973f2012-08-31 17:29:55 -05001815static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001816{
1817 int ret;
1818 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001819
1820 ret = rbd_read_header(rbd_dev, &h);
1821 if (ret < 0)
1822 return ret;
1823
Josh Durgina51aa0c2011-12-05 10:35:04 -08001824 down_write(&rbd_dev->header_rwsem);
1825
Alex Elder94785542012-10-09 13:50:17 -07001826 /* Update image size, and check for resize of mapped image */
1827 rbd_dev->header.image_size = h.image_size;
1828 rbd_update_mapping_size(rbd_dev);
Sage Weil9db4b3e2011-04-19 22:49:06 -07001829
Alex Elder849b4262012-07-09 21:04:24 -05001830 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001831 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05001832 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08001833 /* osd requests may still refer to snapc */
1834 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001835
Alex Elderb8136232012-07-25 09:32:41 -05001836 if (hver)
1837 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08001838 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08001839 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001840 rbd_dev->header.snapc = h.snapc;
1841 rbd_dev->header.snap_names = h.snap_names;
1842 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05001843 /* Free the extra copy of the object prefix */
1844 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1845 kfree(h.object_prefix);
1846
Alex Elder304f6802012-08-31 17:29:52 -05001847 ret = rbd_dev_snaps_update(rbd_dev);
1848 if (!ret)
1849 ret = rbd_dev_snaps_register(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001850
Josh Durginc6666012011-11-21 17:11:12 -08001851 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001852
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001853 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001854}
1855
Alex Elder117973f2012-08-31 17:29:55 -05001856static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
Alex Elder1fe5e992012-07-25 09:32:41 -05001857{
1858 int ret;
1859
Alex Elder117973f2012-08-31 17:29:55 -05001860 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder1fe5e992012-07-25 09:32:41 -05001861 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05001862 if (rbd_dev->image_format == 1)
1863 ret = rbd_dev_v1_refresh(rbd_dev, hver);
1864 else
1865 ret = rbd_dev_v2_refresh(rbd_dev, hver);
Alex Elder1fe5e992012-07-25 09:32:41 -05001866 mutex_unlock(&ctl_mutex);
1867
1868 return ret;
1869}
1870
/*
 * Allocate and configure the gendisk and request queue for an rbd
 * device, and set the initial capacity from the current mapping.
 * The disk is not yet added to the system here (no add_disk() call
 * in this function).  Returns 0 or -ENOMEM.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* Keep bios from spanning object boundaries */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
1919
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001920/*
1921 sysfs
1922*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001923
/* Map a sysfs struct device back to its containing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1928
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001929static ssize_t rbd_size_show(struct device *dev,
1930 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001931{
Alex Elder593a9e72012-02-07 12:03:37 -06001932 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001933 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001934
Josh Durgina51aa0c2011-12-05 10:35:04 -08001935 down_read(&rbd_dev->header_rwsem);
1936 size = get_capacity(rbd_dev->disk);
1937 up_read(&rbd_dev->header_rwsem);
1938
1939 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001940}
1941
Alex Elder34b13182012-07-13 20:35:12 -05001942/*
1943 * Note this shows the features for whatever's mapped, which is not
1944 * necessarily the base image.
1945 */
1946static ssize_t rbd_features_show(struct device *dev,
1947 struct device_attribute *attr, char *buf)
1948{
1949 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1950
1951 return sprintf(buf, "0x%016llx\n",
1952 (unsigned long long) rbd_dev->mapping.features);
1953}
1954
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001955static ssize_t rbd_major_show(struct device *dev,
1956 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001957{
Alex Elder593a9e72012-02-07 12:03:37 -06001958 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001959
1960 return sprintf(buf, "%d\n", rbd_dev->major);
1961}
1962
1963static ssize_t rbd_client_id_show(struct device *dev,
1964 struct device_attribute *attr, char *buf)
1965{
Alex Elder593a9e72012-02-07 12:03:37 -06001966 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001967
Alex Elder1dbb4392012-01-24 10:08:37 -06001968 return sprintf(buf, "client%lld\n",
1969 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001970}
1971
1972static ssize_t rbd_pool_show(struct device *dev,
1973 struct device_attribute *attr, char *buf)
1974{
Alex Elder593a9e72012-02-07 12:03:37 -06001975 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001976
1977 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1978}
1979
Alex Elder9bb2f332012-07-12 10:46:35 -05001980static ssize_t rbd_pool_id_show(struct device *dev,
1981 struct device_attribute *attr, char *buf)
1982{
1983 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1984
1985 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1986}
1987
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001988static ssize_t rbd_name_show(struct device *dev,
1989 struct device_attribute *attr, char *buf)
1990{
Alex Elder593a9e72012-02-07 12:03:37 -06001991 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001992
Alex Elder0bed54d2012-07-03 16:01:18 -05001993 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001994}
1995
Alex Elder589d30e2012-07-10 20:30:11 -05001996static ssize_t rbd_image_id_show(struct device *dev,
1997 struct device_attribute *attr, char *buf)
1998{
1999 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2000
2001 return sprintf(buf, "%s\n", rbd_dev->image_id);
2002}
2003
Alex Elder34b13182012-07-13 20:35:12 -05002004/*
2005 * Shows the name of the currently-mapped snapshot (or
2006 * RBD_SNAP_HEAD_NAME for the base image).
2007 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002008static ssize_t rbd_snap_show(struct device *dev,
2009 struct device_attribute *attr,
2010 char *buf)
2011{
Alex Elder593a9e72012-02-07 12:03:37 -06002012 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002013
Alex Elderf84344f2012-08-31 17:29:51 -05002014 return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002015}
2016
2017static ssize_t rbd_image_refresh(struct device *dev,
2018 struct device_attribute *attr,
2019 const char *buf,
2020 size_t size)
2021{
Alex Elder593a9e72012-02-07 12:03:37 -06002022 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05002023 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002024
Alex Elder117973f2012-08-31 17:29:55 -05002025 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05002026
2027 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002028}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002029
/* Per-device sysfs attributes (documented in sysfs-bus-rbd). */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
/* refresh is write-only: writing anything re-reads the header */
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* Intentionally empty; rbd_device lifetime is not tied to this dev. */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
2073
2074
2075/*
2076 sysfs - snapshots
2077*/
2078
2079static ssize_t rbd_snap_size_show(struct device *dev,
2080 struct device_attribute *attr,
2081 char *buf)
2082{
2083 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2084
Josh Durgin35915382011-12-05 18:25:13 -08002085 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002086}
2087
2088static ssize_t rbd_snap_id_show(struct device *dev,
2089 struct device_attribute *attr,
2090 char *buf)
2091{
2092 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2093
Josh Durgin35915382011-12-05 18:25:13 -08002094 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002095}
2096
Alex Elder34b13182012-07-13 20:35:12 -05002097static ssize_t rbd_snap_features_show(struct device *dev,
2098 struct device_attribute *attr,
2099 char *buf)
2100{
2101 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2102
2103 return sprintf(buf, "0x%016llx\n",
2104 (unsigned long long) snap->features);
2105}
2106
/* Per-snapshot sysfs attributes. */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* Device-model release: frees the rbd_snap when its last ref drops. */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2138
Alex Elder304f6802012-08-31 17:29:52 -05002139static bool rbd_snap_registered(struct rbd_snap *snap)
2140{
2141 bool ret = snap->dev.type == &rbd_snap_device_type;
2142 bool reg = device_is_registered(&snap->dev);
2143
2144 rbd_assert(!ret ^ reg);
2145
2146 return ret;
2147}
2148
/*
 * Unlink a snapshot from its device's list and unregister its sysfs
 * device if it was ever registered (unregistering drops the final
 * reference, which frees the snap via rbd_snap_dev_release()).
 */
static void rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	if (device_is_registered(&snap->dev))
		device_unregister(&snap->dev);
}
2155
Alex Elder14e70852012-07-19 09:09:27 -05002156static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002157 struct device *parent)
2158{
2159 struct device *dev = &snap->dev;
2160 int ret;
2161
2162 dev->type = &rbd_snap_device_type;
2163 dev->parent = parent;
2164 dev->release = rbd_snap_dev_release;
Alex Elderd4b125e2012-07-03 16:01:19 -05002165 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
Alex Elder304f6802012-08-31 17:29:52 -05002166 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2167
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002168 ret = device_register(dev);
2169
2170 return ret;
2171}
2172
Alex Elder4e891e02012-07-10 20:30:10 -05002173static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05002174 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05002175 u64 snap_id, u64 snap_size,
2176 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002177{
Alex Elder4e891e02012-07-10 20:30:10 -05002178 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002179 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002180
2181 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002182 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002183 return ERR_PTR(-ENOMEM);
2184
2185 ret = -ENOMEM;
Alex Elderc8d18422012-07-10 20:30:11 -05002186 snap->name = kstrdup(snap_name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002187 if (!snap->name)
2188 goto err;
2189
Alex Elderc8d18422012-07-10 20:30:11 -05002190 snap->id = snap_id;
2191 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05002192 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05002193
2194 return snap;
2195
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002196err:
2197 kfree(snap->name);
2198 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002199
2200 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002201}
2202
Alex Eldercd892122012-07-03 16:01:19 -05002203static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2204 u64 *snap_size, u64 *snap_features)
2205{
2206 char *snap_name;
2207
2208 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2209
2210 *snap_size = rbd_dev->header.snap_sizes[which];
2211 *snap_features = 0; /* No features for v1 */
2212
2213 /* Skip over names until we find the one we are looking for */
2214
2215 snap_name = rbd_dev->header.snap_names;
2216 while (which--)
2217 snap_name += strlen(snap_name) + 1;
2218
2219 return snap_name;
2220}
2221
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	/* Reply layout of the "rbd" class get_size method (wire format) */
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2255
/* Fill in the base image's object order and size from the osd. */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
2262
/*
 * Fetch the object name prefix for a format 2 image via the "rbd"
 * class get_object_prefix method and store it (as a newly-allocated
 * string) in rbd_dev->header.object_prefix.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	/* Reply is a length-prefixed encoded string */
	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
2300
/*
 * Get the feature bits for an image snapshot (or, with CEPH_NOSNAP,
 * for the base image).  Fails with -ENOTSUPP if the image has any
 * incompatible feature bits this driver does not know about.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	/* Reply layout of the "rbd" class get_features method */
	struct {
		__le64 features;
		__le64 incompat;
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Refuse images requiring features we don't implement */
	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_ALL)
		return -ENOTSUPP;

	*snap_features = le64_to_cpu(features_buf.features);

	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2334
/* Fill in the base image's feature bits from the osd. */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}
2340
Alex Elder6e14b1a2012-07-03 16:01:19 -05002341static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05002342{
2343 size_t size;
2344 int ret;
2345 void *reply_buf;
2346 void *p;
2347 void *end;
2348 u64 seq;
2349 u32 snap_count;
2350 struct ceph_snap_context *snapc;
2351 u32 i;
2352
2353 /*
2354 * We'll need room for the seq value (maximum snapshot id),
2355 * snapshot count, and array of that many snapshot ids.
2356 * For now we have a fixed upper limit on the number we're
2357 * prepared to receive.
2358 */
2359 size = sizeof (__le64) + sizeof (__le32) +
2360 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2361 reply_buf = kzalloc(size, GFP_KERNEL);
2362 if (!reply_buf)
2363 return -ENOMEM;
2364
2365 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2366 "rbd", "get_snapcontext",
2367 NULL, 0,
2368 reply_buf, size,
Alex Elder6e14b1a2012-07-03 16:01:19 -05002369 CEPH_OSD_FLAG_READ, ver);
Alex Elder35d489f2012-07-03 16:01:19 -05002370 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2371 if (ret < 0)
2372 goto out;
2373
2374 ret = -ERANGE;
2375 p = reply_buf;
2376 end = (char *) reply_buf + size;
2377 ceph_decode_64_safe(&p, end, seq, out);
2378 ceph_decode_32_safe(&p, end, snap_count, out);
2379
2380 /*
2381 * Make sure the reported number of snapshot ids wouldn't go
2382 * beyond the end of our buffer. But before checking that,
2383 * make sure the computed size of the snapshot context we
2384 * allocate is representable in a size_t.
2385 */
2386 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2387 / sizeof (u64)) {
2388 ret = -EINVAL;
2389 goto out;
2390 }
2391 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2392 goto out;
2393
2394 size = sizeof (struct ceph_snap_context) +
2395 snap_count * sizeof (snapc->snaps[0]);
2396 snapc = kmalloc(size, GFP_KERNEL);
2397 if (!snapc) {
2398 ret = -ENOMEM;
2399 goto out;
2400 }
2401
2402 atomic_set(&snapc->nref, 1);
2403 snapc->seq = seq;
2404 snapc->num_snaps = snap_count;
2405 for (i = 0; i < snap_count; i++)
2406 snapc->snaps[i] = ceph_decode_64(&p);
2407
2408 rbd_dev->header.snapc = snapc;
2409
2410 dout(" snap context seq = %llu, snap_count = %u\n",
2411 (unsigned long long) seq, (unsigned int) snap_count);
2412
2413out:
2414 kfree(reply_buf);
2415
2416 return 0;
2417}
2418
/*
 * Look up the name of snapshot index "which" (an index into the
 * image's snapshot context) by calling the "get_snapshot_name"
 * method on the image's header object.
 *
 * Returns a dynamically-allocated copy of the name (caller must
 * free it), or a pointer-coded negative errno on failure.
 */
static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	size_t size;
	void *reply_buf;
	__le64 snap_id;
	int ret;
	void *p;
	void *end;
	size_t snap_name_len;
	char *snap_name;

	/* Reply is a length-prefixed string; allow for the maximum name */
	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_snapshot_name",
				(char *) &snap_id, sizeof (snap_id),
				reply_buf, size,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	end = (char *) reply_buf + size;
	snap_name_len = 0;
	/* Allocates and NUL-terminates a copy of the encoded string */
	snap_name = ceph_extract_encoded_string(&p, end, &snap_name_len,
			GFP_KERNEL);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out;
	} else {
		dout(" snap_id 0x%016llx snap_name = %s\n",
			(unsigned long long) le64_to_cpu(snap_id), snap_name);
	}
	kfree(reply_buf);

	return snap_name;
out:
	kfree(reply_buf);

	return ERR_PTR(ret);
}
2465
2466static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2467 u64 *snap_size, u64 *snap_features)
2468{
2469 __le64 snap_id;
2470 u8 order;
2471 int ret;
2472
2473 snap_id = rbd_dev->header.snapc->snaps[which];
2474 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2475 if (ret)
2476 return ERR_PTR(ret);
2477 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2478 if (ret)
2479 return ERR_PTR(ret);
2480
2481 return rbd_dev_v2_snap_name(rbd_dev, which);
2482}
2483
2484static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2485 u64 *snap_size, u64 *snap_features)
2486{
2487 if (rbd_dev->image_format == 1)
2488 return rbd_dev_v1_snap_info(rbd_dev, which,
2489 snap_size, snap_features);
2490 if (rbd_dev->image_format == 2)
2491 return rbd_dev_v2_snap_info(rbd_dev, which,
2492 snap_size, snap_features);
2493 return ERR_PTR(-EINVAL);
2494}
2495
Alex Elder117973f2012-08-31 17:29:55 -05002496static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2497{
2498 int ret;
2499 __u8 obj_order;
2500
2501 down_write(&rbd_dev->header_rwsem);
2502
2503 /* Grab old order first, to see if it changes */
2504
2505 obj_order = rbd_dev->header.obj_order,
2506 ret = rbd_dev_v2_image_size(rbd_dev);
2507 if (ret)
2508 goto out;
2509 if (rbd_dev->header.obj_order != obj_order) {
2510 ret = -EIO;
2511 goto out;
2512 }
2513 rbd_update_mapping_size(rbd_dev);
2514
2515 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2516 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2517 if (ret)
2518 goto out;
2519 ret = rbd_dev_snaps_update(rbd_dev);
2520 dout("rbd_dev_snaps_update returned %d\n", ret);
2521 if (ret)
2522 goto out;
2523 ret = rbd_dev_snaps_register(rbd_dev);
2524 dout("rbd_dev_snaps_register returned %d\n", ret);
2525out:
2526 up_write(&rbd_dev->header_rwsem);
2527
2528 return ret;
2529}
2530
Alex Elder9d475de2012-07-03 16:01:19 -05002531/*
Alex Elder35938152012-08-02 11:29:46 -05002532 * Scan the rbd device's current snapshot list and compare it to the
2533 * newly-received snapshot context. Remove any existing snapshots
2534 * not present in the new snapshot context. Add a new snapshot for
2535 * any snaphots in the snapshot context not in the current list.
2536 * And verify there are no changes to snapshots we already know
2537 * about.
2538 *
2539 * Assumes the snapshots in the snapshot context are sorted by
2540 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2541 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002542 */
Alex Elder304f6802012-08-31 17:29:52 -05002543static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002544{
Alex Elder35938152012-08-02 11:29:46 -05002545 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2546 const u32 snap_count = snapc->num_snaps;
Alex Elder35938152012-08-02 11:29:46 -05002547 struct list_head *head = &rbd_dev->snaps;
2548 struct list_head *links = head->next;
2549 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002550
Alex Elder9fcbb802012-08-23 23:48:49 -05002551 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
Alex Elder35938152012-08-02 11:29:46 -05002552 while (index < snap_count || links != head) {
2553 u64 snap_id;
2554 struct rbd_snap *snap;
Alex Eldercd892122012-07-03 16:01:19 -05002555 char *snap_name;
2556 u64 snap_size = 0;
2557 u64 snap_features = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002558
Alex Elder35938152012-08-02 11:29:46 -05002559 snap_id = index < snap_count ? snapc->snaps[index]
2560 : CEPH_NOSNAP;
2561 snap = links != head ? list_entry(links, struct rbd_snap, node)
2562 : NULL;
Alex Elderaafb230e2012-09-06 16:00:54 -05002563 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002564
Alex Elder35938152012-08-02 11:29:46 -05002565 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2566 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002567
Alex Elder35938152012-08-02 11:29:46 -05002568 /* Existing snapshot not in the new snap context */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002569
Alex Elderf84344f2012-08-31 17:29:51 -05002570 if (rbd_dev->mapping.snap_id == snap->id)
2571 rbd_dev->mapping.snap_exists = false;
Alex Elder41f38c22012-10-25 23:34:40 -05002572 rbd_remove_snap_dev(snap);
Alex Elder9fcbb802012-08-23 23:48:49 -05002573 dout("%ssnap id %llu has been removed\n",
Alex Elderf84344f2012-08-31 17:29:51 -05002574 rbd_dev->mapping.snap_id == snap->id ?
2575 "mapped " : "",
Alex Elder9fcbb802012-08-23 23:48:49 -05002576 (unsigned long long) snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002577
Alex Elder35938152012-08-02 11:29:46 -05002578 /* Done with this list entry; advance */
2579
2580 links = next;
2581 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002582 }
Alex Elder35938152012-08-02 11:29:46 -05002583
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002584 snap_name = rbd_dev_snap_info(rbd_dev, index,
2585 &snap_size, &snap_features);
Alex Eldercd892122012-07-03 16:01:19 -05002586 if (IS_ERR(snap_name))
2587 return PTR_ERR(snap_name);
2588
Alex Elder9fcbb802012-08-23 23:48:49 -05002589 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2590 (unsigned long long) snap_id);
Alex Elder35938152012-08-02 11:29:46 -05002591 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2592 struct rbd_snap *new_snap;
2593
2594 /* We haven't seen this snapshot before */
2595
Alex Elderc8d18422012-07-10 20:30:11 -05002596 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
Alex Eldercd892122012-07-03 16:01:19 -05002597 snap_id, snap_size, snap_features);
Alex Elder9fcbb802012-08-23 23:48:49 -05002598 if (IS_ERR(new_snap)) {
2599 int err = PTR_ERR(new_snap);
2600
2601 dout(" failed to add dev, error %d\n", err);
2602
2603 return err;
2604 }
Alex Elder35938152012-08-02 11:29:46 -05002605
2606 /* New goes before existing, or at end of list */
2607
Alex Elder9fcbb802012-08-23 23:48:49 -05002608 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05002609 if (snap)
2610 list_add_tail(&new_snap->node, &snap->node);
2611 else
Alex Elder523f3252012-08-30 00:16:37 -05002612 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05002613 } else {
2614 /* Already have this one */
2615
Alex Elder9fcbb802012-08-23 23:48:49 -05002616 dout(" already present\n");
2617
Alex Eldercd892122012-07-03 16:01:19 -05002618 rbd_assert(snap->size == snap_size);
Alex Elderaafb230e2012-09-06 16:00:54 -05002619 rbd_assert(!strcmp(snap->name, snap_name));
Alex Eldercd892122012-07-03 16:01:19 -05002620 rbd_assert(snap->features == snap_features);
Alex Elder35938152012-08-02 11:29:46 -05002621
2622 /* Done with this list entry; advance */
2623
2624 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002625 }
Alex Elder35938152012-08-02 11:29:46 -05002626
2627 /* Advance to the next entry in the snapshot context */
2628
2629 index++;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002630 }
Alex Elder9fcbb802012-08-23 23:48:49 -05002631 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002632
2633 return 0;
2634}
2635
Alex Elder304f6802012-08-31 17:29:52 -05002636/*
2637 * Scan the list of snapshots and register the devices for any that
2638 * have not already been registered.
2639 */
2640static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2641{
2642 struct rbd_snap *snap;
2643 int ret = 0;
2644
2645 dout("%s called\n", __func__);
Alex Elder86ff77b2012-08-31 17:29:53 -05002646 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2647 return -EIO;
Alex Elder304f6802012-08-31 17:29:52 -05002648
2649 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2650 if (!rbd_snap_registered(snap)) {
2651 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2652 if (ret < 0)
2653 break;
2654 }
2655 }
2656 dout("%s: returning %d\n", __func__, ret);
2657
2658 return ret;
2659}
2660
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002661static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2662{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002663 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05002664 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002665
2666 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002667
Alex Eldercd789ab2012-08-30 00:16:38 -05002668 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002669 dev->bus = &rbd_bus_type;
2670 dev->type = &rbd_device_type;
2671 dev->parent = &rbd_root_dev;
2672 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05002673 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002674 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002675
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002676 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05002677
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002678 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002679}
2680
/*
 * Remove the rbd device from sysfs.  Its release callback
 * (rbd_dev_release, installed in rbd_bus_add_dev()) runs once the
 * last reference to the device is dropped.
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2685
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002686static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2687{
2688 int ret, rc;
2689
2690 do {
Alex Elder0e6f3222012-07-25 09:32:40 -05002691 ret = rbd_req_sync_watch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002692 if (ret == -ERANGE) {
Alex Elder117973f2012-08-31 17:29:55 -05002693 rc = rbd_dev_refresh(rbd_dev, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002694 if (rc < 0)
2695 return rc;
2696 }
2697 } while (ret == -ERANGE);
2698
2699 return ret;
2700}
2701
Alex Eldere2839302012-08-29 17:11:06 -05002702static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06002703
2704/*
Alex Elder499afd52012-02-02 08:13:29 -06002705 * Get a unique rbd identifier for the given new rbd_dev, and add
2706 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06002707 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* atomic64_inc_return() means the first id handed out is 1 */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	/* Make the device visible on the global device list */
	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002718
Alex Elder1ddbe942012-01-29 13:57:44 -06002719/*
Alex Elder499afd52012-02-02 08:13:29 -06002720 * Remove an rbd_dev from the global list, and record that its
2721 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002722 */
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
{
	struct list_head *tmp;
	int rbd_id = rbd_dev->dev_id;
	int max_id;

	/* Ids start at 1 (see rbd_dev_id_get()); 0 means corruption */
	rbd_assert(rbd_id > 0);

	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);

	/*
	 * If the id being "put" is not the current maximum, there
	 * is nothing special we need to do.
	 */
	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
		spin_unlock(&rbd_dev_list_lock);
		return;
	}

	/*
	 * We need to update the current maximum id.  Search the
	 * list to find out what it is.  We're more likely to find
	 * the maximum at the end, so search the list backward.
	 */
	max_id = 0;
	list_for_each_prev(tmp, &rbd_dev_list) {
		struct rbd_device *rbd_dev;	/* shadows outer rbd_dev */

		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id > max_id)
			max_id = rbd_dev->dev_id;
	}
	spin_unlock(&rbd_dev_list_lock);

	/*
	 * The max id could have been updated by rbd_dev_id_get(), in
	 * which case it now accurately reflects the new maximum.
	 * Be careful not to overwrite the maximum value in that
	 * case.
	 */
	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
	dout(" max dev id has been reset\n");
}
2769
Alex Eldera725f65e2012-02-02 08:13:30 -06002770/*
Alex Eldere28fff262012-02-02 08:13:30 -06002771 * Skips over white space at *buf, and updates *buf to point to the
2772 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06002773 * the token (string of non-white space characters) found. Note
2774 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06002775 */
static inline size_t next_token(const char **buf)
{
	/* Characters isspace() flags in the "C" and "POSIX" locales */
	static const char spaces[] = " \f\n\r\t\v";
	const char *p = *buf;

	p += strspn(p, spaces);		/* Skip leading white space */
	*buf = p;

	return strcspn(p, spaces);	/* Length of the token at p */
}
2788
2789/*
2790 * Finds the next token in *buf, and if the provided token buffer is
2791 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06002792 * copied, is guaranteed to be terminated with '\0'. Note that *buf
2793 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06002794 *
2795 * Returns the length of the token found (not including the '\0').
2796 * Return value will be 0 if no token is found, and it will be >=
2797 * token_size if the token would not fit.
2798 *
Alex Elder593a9e72012-02-07 12:03:37 -06002799 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06002800 * found token. Note that this occurs even if the token buffer is
2801 * too small to hold it.
2802 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';	/* Always terminate what we copy */
	}
	*buf += len;			/* Consume the token regardless */

	return len;
}
2818
2819/*
Alex Elderea3352f2012-07-09 21:04:23 -05002820 * Finds the next token in *buf, dynamically allocates a buffer big
2821 * enough to hold a copy of it, and copies the token into the new
2822 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2823 * that a duplicate buffer is created even for a zero-length token.
2824 *
2825 * Returns a pointer to the newly-allocated duplicate, or a null
2826 * pointer if memory for the duplicate was not available. If
2827 * the lenp argument is a non-null pointer, the length of the token
2828 * (not including the '\0') is returned in *lenp.
2829 *
2830 * If successful, the *buf pointer will be updated to point beyond
2831 * the end of the found token.
2832 *
2833 * Note: uses GFP_KERNEL for allocation.
2834 */
2835static inline char *dup_token(const char **buf, size_t *lenp)
2836{
2837 char *dup;
2838 size_t len;
2839
2840 len = next_token(buf);
2841 dup = kmalloc(len + 1, GFP_KERNEL);
2842 if (!dup)
2843 return NULL;
2844
2845 memcpy(dup, *buf, len);
2846 *(dup + len) = '\0';
2847 *buf += len;
2848
2849 if (lenp)
2850 *lenp = len;
2851
2852 return dup;
2853}
2854
2855/*
Alex Elder3feeb8942012-08-31 17:29:52 -05002856 * This fills in the pool_name, image_name, image_name_len, rbd_dev,
2857 * rbd_md_name, and name fields of the given rbd_dev, based on the
2858 * list of monitor addresses and other options provided via
2859 * /sys/bus/rbd/add. Returns a pointer to a dynamically-allocated
2860 * copy of the snapshot name to map if successful, or a
2861 * pointer-coded error otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05002862 *
2863 * Note: rbd_dev is assumed to have been initially zero-filled.
Alex Eldera725f65e2012-02-02 08:13:30 -06002864 */
static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
				const char *buf,
				const char **mon_addrs,
				size_t *mon_addrs_size,
				char *options,
				size_t options_size)
{
	size_t len;
	char *err_ptr = ERR_PTR(-EINVAL);
	char *snap_name;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len)
		return err_ptr;
	*mon_addrs_size = len + 1;	/* Room for a terminator, too */
	*mon_addrs = buf;		/* Points into caller's buffer */

	buf += len;

	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return err_ptr;		/* Missing or over-long options */

	/* Any further failure is an allocation failure */
	err_ptr = ERR_PTR(-ENOMEM);
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Snapshot name is optional; default is to use "head" */

	len = next_token(&buf);
	if (len > RBD_MAX_SNAP_NAME_LEN) {
		err_ptr = ERR_PTR(-ENAMETOOLONG);
		goto out_err;
	}
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	}
	snap_name = kmalloc(len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_err;
	memcpy(snap_name, buf, len);
	*(snap_name + len) = '\0';

	return snap_name;

out_err:
	/* Release and clear everything allocated here on failure */
	kfree(rbd_dev->image_name);
	rbd_dev->image_name = NULL;
	rbd_dev->image_name_len = 0;
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return err_ptr;
}
2927
Alex Elder589d30e2012-07-10 20:30:11 -05002928/*
2929 * An rbd format 2 image has a unique identifier, distinct from the
2930 * name given to it by the user. Internally, that identifier is
2931 * what's used to specify the names of objects related to the image.
2932 *
2933 * A special "rbd id" object is used to map an rbd image name to its
2934 * id. If that object doesn't exist, then there is no v2 rbd image
2935 * with the supplied name.
2936 *
2937 * This function will record the given rbd_dev's image_id field if
2938 * it can be determined, and in that case will return 0. If any
2939 * errors occur a negative errno will be returned and the rbd_dev's
2940 * image_id field will be unchanged (and should be NULL).
2941 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len;
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	p = response;
	/* Allocates image_id; freed by the device teardown path */
	rbd_dev->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						&rbd_dev->image_id_len,
						GFP_NOIO);
	if (IS_ERR(rbd_dev->image_id)) {
		ret = PTR_ERR(rbd_dev->image_id);
		rbd_dev->image_id = NULL;	/* Keep field NULL on error */
	} else {
		dout("image_id is %s\n", rbd_dev->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
2997
/*
 * Probe for a format 1 ("old" format) rbd image:  record an empty
 * image id, derive the header object name from the image name, and
 * read the on-disk header into rbd_dev->header.  Returns 0 on
 * success or a negative errno, undoing partial setup on failure.
 */
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->image_id)
		return -ENOMEM;
	rbd_dev->image_id_len = 0;

	/* Record the header object name for this rbd image. */

	size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;
	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	/* Leave the fields as the caller found them (NULL) on failure */
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->image_id);
	rbd_dev->image_id = NULL;

	return ret;
}
3040
/*
 * Probe for a format 2 rbd image.  The image id must already have
 * been stored in rbd_dev->image_id by rbd_dev_image_id().  Derives
 * the header object name and fills in the image metadata:  size and
 * object order, object prefix, features, and snapshot context.
 * Returns 0 on success or a negative errno, releasing anything
 * allocated here on failure.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len;
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3102
3103/*
3104 * Probe for the existence of the header object for the given rbd
3105 * device. For format 2 images this includes determining the image
3106 * id.
3107 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret;

	/*
	 * Try to read the image id object first.  Failure to find
	 * one (e.g. ENOENT) is taken to mean this is a format 1
	 * image; success means it is format 2.
	 */
	if (rbd_dev_image_id(rbd_dev) == 0)
		ret = rbd_dev_v2_probe(rbd_dev);
	else
		ret = rbd_dev_v1_probe(rbd_dev);

	if (ret)
		dout("probe failed, returning %d\n", ret);

	return ret;
}
3127
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07003128static ssize_t rbd_add(struct bus_type *bus,
3129 const char *buf,
3130 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003131{
Alex Eldercb8627c2012-07-09 21:04:23 -05003132 char *options;
3133 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06003134 const char *mon_addrs = NULL;
3135 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06003136 struct ceph_osd_client *osdc;
3137 int rc = -ENOMEM;
Alex Elder3feeb8942012-08-31 17:29:52 -05003138 char *snap_name;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003139
3140 if (!try_module_get(THIS_MODULE))
3141 return -ENODEV;
3142
Alex Elder27cc2592012-02-02 08:13:30 -06003143 options = kmalloc(count, GFP_KERNEL);
3144 if (!options)
Alex Elder85ae8922012-07-26 23:37:14 -05003145 goto err_out_mem;
Alex Eldercb8627c2012-07-09 21:04:23 -05003146 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
3147 if (!rbd_dev)
Alex Elder85ae8922012-07-26 23:37:14 -05003148 goto err_out_mem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003149
3150 /* static rbd_device initialization */
3151 spin_lock_init(&rbd_dev->lock);
3152 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003153 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08003154 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003155
Alex Eldera725f65e2012-02-02 08:13:30 -06003156 /* parse add command */
Alex Elder3feeb8942012-08-31 17:29:52 -05003157 snap_name = rbd_add_parse_args(rbd_dev, buf,
3158 &mon_addrs, &mon_addrs_size, options, count);
3159 if (IS_ERR(snap_name)) {
3160 rc = PTR_ERR(snap_name);
Alex Elder85ae8922012-07-26 23:37:14 -05003161 goto err_out_mem;
Alex Elder3feeb8942012-08-31 17:29:52 -05003162 }
Alex Eldera725f65e2012-02-02 08:13:30 -06003163
Alex Elderf8c38922012-08-10 13:12:07 -07003164 rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
3165 if (rc < 0)
Alex Elder85ae8922012-07-26 23:37:14 -05003166 goto err_out_args;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003167
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003168 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06003169 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003170 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
3171 if (rc < 0)
3172 goto err_out_client;
Alex Elder9bb2f332012-07-12 10:46:35 -05003173 rbd_dev->pool_id = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003174
Alex Eldera30b71b2012-07-10 20:30:11 -05003175 rc = rbd_dev_probe(rbd_dev);
3176 if (rc < 0)
Alex Elder589d30e2012-07-10 20:30:11 -05003177 goto err_out_client;
Alex Elder05fd6f62012-08-29 17:11:07 -05003178
3179 /* no need to lock here, as rbd_dev is not registered yet */
3180 rc = rbd_dev_snaps_update(rbd_dev);
3181 if (rc)
Alex Elder41f38c22012-10-25 23:34:40 -05003182 goto err_out_probe;
Alex Elder05fd6f62012-08-29 17:11:07 -05003183
3184 rc = rbd_dev_set_mapping(rbd_dev, snap_name);
3185 if (rc)
Alex Elder41f38c22012-10-25 23:34:40 -05003186 goto err_out_snaps;
Alex Elder05fd6f62012-08-29 17:11:07 -05003187
Alex Elder85ae8922012-07-26 23:37:14 -05003188 /* generate unique id: find highest unique id, add one */
3189 rbd_dev_id_get(rbd_dev);
3190
3191 /* Fill in the device name, now that we have its id. */
3192 BUILD_BUG_ON(DEV_NAME_LEN
3193 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
3194 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
3195
3196 /* Get our block major device number. */
3197
Alex Elder27cc2592012-02-02 08:13:30 -06003198 rc = register_blkdev(0, rbd_dev->name);
3199 if (rc < 0)
Alex Elder85ae8922012-07-26 23:37:14 -05003200 goto err_out_id;
Alex Elder27cc2592012-02-02 08:13:30 -06003201 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003202
Alex Elder0f308a32012-08-29 17:11:07 -05003203 /* Set up the blkdev mapping. */
3204
3205 rc = rbd_init_disk(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003206 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08003207 goto err_out_blkdev;
3208
Alex Elder0f308a32012-08-29 17:11:07 -05003209 rc = rbd_bus_add_dev(rbd_dev);
3210 if (rc)
3211 goto err_out_disk;
3212
Alex Elder32eec682012-02-08 16:11:14 -06003213 /*
3214 * At this point cleanup in the event of an error is the job
3215 * of the sysfs code (initiated by rbd_bus_del_dev()).
Alex Elder32eec682012-02-08 16:11:14 -06003216 */
Alex Elder2ac4e752012-07-10 20:30:10 -05003217
Alex Elder4bb1f1e2012-08-23 23:48:49 -05003218 down_write(&rbd_dev->header_rwsem);
Alex Elder5ed16172012-08-29 17:11:07 -05003219 rc = rbd_dev_snaps_register(rbd_dev);
Alex Elder4bb1f1e2012-08-23 23:48:49 -05003220 up_write(&rbd_dev->header_rwsem);
Alex Elder2ac4e752012-07-10 20:30:10 -05003221 if (rc)
3222 goto err_out_bus;
3223
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07003224 rc = rbd_init_watch_dev(rbd_dev);
3225 if (rc)
3226 goto err_out_bus;
3227
Alex Elder3ee40012012-08-29 17:11:07 -05003228 /* Everything's ready. Announce the disk to the world. */
3229
3230 add_disk(rbd_dev->disk);
3231
3232 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
3233 (unsigned long long) rbd_dev->mapping.size);
3234
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003235 return count;
3236
Yehuda Sadeh766fc432011-01-07 14:58:42 -08003237err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08003238 /* this will also clean up rest of rbd_dev stuff */
3239
3240 rbd_bus_del_dev(rbd_dev);
3241 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08003242 return rc;
3243
Alex Elder0f308a32012-08-29 17:11:07 -05003244err_out_disk:
3245 rbd_free_disk(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003246err_out_blkdev:
3247 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder85ae8922012-07-26 23:37:14 -05003248err_out_id:
3249 rbd_dev_id_put(rbd_dev);
Alex Elder41f38c22012-10-25 23:34:40 -05003250err_out_snaps:
3251 rbd_remove_all_snaps(rbd_dev);
3252err_out_probe:
Alex Elder05fd6f62012-08-29 17:11:07 -05003253 rbd_header_free(&rbd_dev->header);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003254err_out_client:
Alex Elder3fcf2582012-07-03 16:01:19 -05003255 kfree(rbd_dev->header_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003256 rbd_put_client(rbd_dev);
Alex Elder589d30e2012-07-10 20:30:11 -05003257 kfree(rbd_dev->image_id);
Alex Elder85ae8922012-07-26 23:37:14 -05003258err_out_args:
3259 kfree(rbd_dev->mapping.snap_name);
3260 kfree(rbd_dev->image_name);
3261 kfree(rbd_dev->pool_name);
3262err_out_mem:
Alex Elder27cc2592012-02-02 08:13:30 -06003263 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05003264 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06003265
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003266 dout("Error adding device %s\n", buf);
3267 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06003268
3269 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003270}
3271
Alex Elderde71a292012-07-03 16:01:19 -05003272static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003273{
3274 struct list_head *tmp;
3275 struct rbd_device *rbd_dev;
3276
Alex Eldere124a822012-01-29 13:57:44 -06003277 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003278 list_for_each(tmp, &rbd_dev_list) {
3279 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05003280 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06003281 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003282 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06003283 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003284 }
Alex Eldere124a822012-01-29 13:57:44 -06003285 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003286 return NULL;
3287}
3288
/*
 * Final teardown of an rbd device, invoked by the driver core when
 * the device's last reference is dropped (after rbd_bus_del_dev()).
 *
 * NOTE(review): the teardown order below is deliberate — watch
 * machinery first (it may still use the osd client), then the client
 * reference, then the block device, then the header and name fields,
 * and the id/struct last.  Do not reorder.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop the lingering watch request before dropping the client */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->mapping.snap_name);
	kfree(rbd_dev->image_id);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_dev_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref (taken in rbd_add()) */
	module_put(THIS_MODULE);
}
3323
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003324static ssize_t rbd_remove(struct bus_type *bus,
3325 const char *buf,
3326 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003327{
3328 struct rbd_device *rbd_dev = NULL;
3329 int target_id, rc;
3330 unsigned long ul;
3331 int ret = count;
3332
3333 rc = strict_strtoul(buf, 10, &ul);
3334 if (rc)
3335 return rc;
3336
3337 /* convert to int; abort if we lost anything in the conversion */
3338 target_id = (int) ul;
3339 if (target_id != ul)
3340 return -EINVAL;
3341
3342 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3343
3344 rbd_dev = __rbd_get_dev(target_id);
3345 if (!rbd_dev) {
3346 ret = -ENOENT;
3347 goto done;
3348 }
3349
Alex Elder41f38c22012-10-25 23:34:40 -05003350 rbd_remove_all_snaps(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003351 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003352
3353done:
3354 mutex_unlock(&ctl_mutex);
Alex Elderaafb230e2012-09-06 16:00:54 -05003355
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003356 return ret;
3357}
3358
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003359/*
3360 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003361 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003362 */
3363static int rbd_sysfs_init(void)
3364{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003365 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003366
Alex Elderfed4c142012-02-07 12:03:36 -06003367 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06003368 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003369 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003370
Alex Elderfed4c142012-02-07 12:03:36 -06003371 ret = bus_register(&rbd_bus_type);
3372 if (ret < 0)
3373 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003374
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003375 return ret;
3376}
3377
/*
 * Remove the sysfs control files; undoes rbd_sysfs_init() in the
 * reverse order of registration (bus before root device).
 */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3383
3384int __init rbd_init(void)
3385{
3386 int rc;
3387
3388 rc = rbd_sysfs_init();
3389 if (rc)
3390 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06003391 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003392 return 0;
3393}
3394
/*
 * Module teardown: remove the sysfs interface.  Per-device cleanup
 * happens via the bus/device release path, not here.
 */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3399
/* Module entry/exit points */
module_init(rbd_init);
module_exit(rbd_exit);

/* Module metadata */
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");