blob: 4858d925b95e5f97d4afe870d822fbd9895ab2cc [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elderaafb2302012-09-06 16:00:54 -050044#define RBD_DEBUG /* Activate rbd_assert() calls */
45
Alex Elder593a9e72012-02-07 12:03:37 -060046/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
Alex Elderdf111be2012-08-09 10:33:26 -070055/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
Alex Elderf0f8cef2012-01-29 13:57:44 -060059#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070061
62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
63
Alex Elderd4b125e2012-07-03 16:01:19 -050064#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
65#define RBD_MAX_SNAP_NAME_LEN \
66 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
67
Alex Elder35d489f2012-07-03 16:01:19 -050068#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070069#define RBD_MAX_OPT_LEN 1024
70
71#define RBD_SNAP_HEAD_NAME "-"
72
Alex Elder1e130192012-07-03 16:01:19 -050073#define RBD_IMAGE_ID_LEN_MAX 64
74#define RBD_OBJ_PREFIX_LEN_MAX 64
Alex Elder589d30e2012-07-10 20:30:11 -050075
Alex Elderd8891402012-10-09 13:50:17 -070076/* Feature bits */
77
78#define RBD_FEATURE_LAYERING 1
79
80/* Features supported by this (client software) implementation. */
81
82#define RBD_FEATURES_ALL (0)
83
Alex Elder81a89792012-02-02 08:13:30 -060084/*
85 * An RBD device name will be "rbd#", where the "rbd" comes from
86 * RBD_DRV_NAME above, and # is a unique integer identifier.
87 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
88 * enough to hold all possible device names.
89 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070090#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060091#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070092
Alex Eldercc0538b2012-08-10 13:12:07 -070093#define RBD_READ_ONLY_DEFAULT false
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070094
Yehuda Sadeh602adf42010-08-12 16:11:25 -070095/*
96 * block device image metadata (in-memory version)
97 */
98struct rbd_image_header {
Alex Elderf84344f2012-08-31 17:29:51 -050099 /* These four fields never change for a given rbd image */
Alex Elder849b4262012-07-09 21:04:24 -0500100 char *object_prefix;
Alex Elder34b13182012-07-13 20:35:12 -0500101 u64 features;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700102 __u8 obj_order;
103 __u8 crypt_type;
104 __u8 comp_type;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700105
Alex Elderf84344f2012-08-31 17:29:51 -0500106 /* The remaining fields need to be updated occasionally */
107 u64 image_size;
108 struct ceph_snap_context *snapc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700109 char *snap_names;
110 u64 *snap_sizes;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700111
112 u64 obj_version;
113};
114
/* User-requested mapping options, parsed by parse_rbd_opts_token() */
struct rbd_options {
	bool read_only;		/* map the device read-only */
};
118
119/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600120 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700121 */
struct rbd_client {
	struct ceph_client *client;	/* handle to the ceph cluster */
	struct kref kref;		/* shared by every device using this client */
	struct list_head node;		/* entry in rbd_client_list */
};
127
128/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600129 * a request completion status
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700130 */
struct rbd_req_status {
	int done;	/* nonzero once this request has completed */
	int rc;		/* completion result; presumably 0 or negative errno — see completion path */
	u64 bytes;	/* bytes transferred by this request */
};
136
137/*
138 * a collection of requests
139 */
struct rbd_req_coll {
	int total;		/* number of requests in this collection */
	int num_done;		/* how many have completed so far */
	struct kref kref;	/* freed by rbd_coll_release() on last put */
	struct rbd_req_status status[0];	/* one status per request, allocated inline */
};
146
Alex Elderf0f8cef2012-01-29 13:57:44 -0600147/*
148 * a single io request
149 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;			/* length of this request, in bytes */
	int coll_index;			/* slot in coll->status[] for this request */
	struct rbd_req_coll *coll;	/* collection this request belongs to */
};
158
/* In-memory record of one snapshot of an image */
struct rbd_snap {
	struct device dev;	/* sysfs device representing the snapshot */
	const char *name;	/* snapshot name */
	u64 size;		/* image size at snapshot time, in bytes */
	struct list_head node;	/* entry in rbd_device->snaps */
	u64 id;			/* snapshot id */
	u64 features;		/* feature bits in effect for the snapshot */
};
167
/* What this device currently maps: the image head or one snapshot */
struct rbd_mapping {
	char *snap_name;	/* mapped snapshot name, or RBD_SNAP_HEAD_NAME */
	u64 snap_id;		/* CEPH_NOSNAP when mapping the image head */
	u64 size;		/* size of the mapped image/snapshot, in bytes */
	u64 features;		/* feature bits of the mapped image/snapshot */
	bool snap_exists;	/* false when mapping the head (see rbd_dev_set_mapping()) */
	bool read_only;		/* always true for snapshots; user option for the head */
};
176
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700177/*
178 * a single device
179 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;	/* blkdev's gendisk and rq */

	u32 image_format;	/* Either 1 or 2 */
	struct rbd_options rbd_opts;	/* options in effect for this mapping */
	struct rbd_client *rbd_client;	/* (possibly shared) cluster client */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;	/* queue lock */

	struct rbd_image_header header;	/* in-memory copy of the image header */
	char *image_id;		/* image id (format 2 images) */
	size_t image_id_len;	/* length of image_id */
	char *image_name;	/* user-supplied image name */
	size_t image_name_len;	/* length of image_name */
	char *header_name;	/* name of the image's header object */
	char *pool_name;	/* name of the pool holding the image */
	int pool_id;		/* id of that pool */

	struct ceph_osd_event *watch_event;	/* header watch registration */
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore header_rwsem;

	struct rbd_mapping mapping;	/* what is currently mapped */

	struct list_head node;	/* entry in the global rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
};
219
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700220static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a82f2012-01-29 13:57:44 -0600221
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700222static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a82f2012-01-29 13:57:44 -0600223static DEFINE_SPINLOCK(rbd_dev_list_lock);
224
Alex Elder432b8582012-01-29 13:57:44 -0600225static LIST_HEAD(rbd_client_list); /* clients */
226static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700227
Alex Elder304f6802012-08-31 17:29:52 -0500228static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
229static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
230
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800231static void rbd_dev_release(struct device *dev);
Alex Elder14e70852012-07-19 09:09:27 -0500232static void __rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800233
Alex Elderf0f8cef2012-01-29 13:57:44 -0600234static ssize_t rbd_add(struct bus_type *bus, const char *buf,
235 size_t count);
236static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
237 size_t count);
238
/* Control files: /sys/bus/rbd/add and /sys/bus/rbd/remove (write-only) */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

/* Pseudo-bus that all rbd devices are registered on */
static struct bus_type rbd_bus_type = {
	.name = "rbd",
	.bus_attrs = rbd_bus_attrs,
};
249
/* Release callback for rbd_root_dev; it is static, so nothing to free */
static void rbd_root_dev_release(struct device *dev)
{
}
253
/* Parent device all rbd devices hang off of in sysfs */
static struct device rbd_root_dev = {
	.init_name = "rbd",
	.release = rbd_root_dev_release,
};
258
#ifdef RBD_DEBUG
/*
 * Verbose BUG() on assertion failure.  The body is wrapped in
 * do { } while (0) so the macro expands to a single statement; the
 * previous bare if-block form could silently capture a following
 * "else" at the call site (dangling-else hazard).
 */
#define rbd_assert(expr)						\
		do {							\
			if (unlikely(!(expr))) {			\
				printk(KERN_ERR "\nAssertion failure in %s() " \
							"at line %d:\n\n" \
						"\trbd_assert(%s);\n\n", \
						__func__, __LINE__, #expr); \
				BUG();					\
			}						\
		} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800271
/* Take a reference on the rbd device via its embedded struct device */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}
276
/* Drop a reference taken with rbd_get_dev() */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700281
Alex Elder117973f2012-08-31 17:29:55 -0500282static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
283static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700284
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700285static int rbd_open(struct block_device *bdev, fmode_t mode)
286{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600287 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700288
Alex Elderf84344f2012-08-31 17:29:51 -0500289 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700290 return -EROFS;
291
Alex Elder340c7a22012-08-10 13:12:07 -0700292 rbd_get_dev(rbd_dev);
Alex Elderf84344f2012-08-31 17:29:51 -0500293 set_device_ro(bdev, rbd_dev->mapping.read_only);
Alex Elder340c7a22012-08-10 13:12:07 -0700294
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700295 return 0;
296}
297
/* Close callback: drop the reference taken in rbd_open() */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	rbd_put_dev(rbd_dev);

	return 0;
}
306
/* Block device operations: rbd only implements open and release */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
312
313/*
314 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500315 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700316 */
Alex Elderf8c38922012-08-10 13:12:07 -0700317static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700318{
319 struct rbd_client *rbdc;
320 int ret = -ENOMEM;
321
322 dout("rbd_client_create\n");
323 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
324 if (!rbdc)
325 goto out_opt;
326
327 kref_init(&rbdc->kref);
328 INIT_LIST_HEAD(&rbdc->node);
329
Alex Elderbc534d82012-01-29 13:57:44 -0600330 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
331
Alex Elder43ae4702012-07-03 16:01:18 -0500332 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700333 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600334 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500335 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700336
337 ret = ceph_open_session(rbdc->client);
338 if (ret < 0)
339 goto out_err;
340
Alex Elder432b8582012-01-29 13:57:44 -0600341 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700342 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600343 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700344
Alex Elderbc534d82012-01-29 13:57:44 -0600345 mutex_unlock(&ctl_mutex);
346
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700347 dout("rbd_client_create created %p\n", rbdc);
348 return rbdc;
349
350out_err:
351 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600352out_mutex:
353 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700354 kfree(rbdc);
355out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500356 if (ceph_opts)
357 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400358 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700359}
360
361/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700362 * Find a ceph client with specific addr and configuration. If
363 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700364 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700365static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700366{
367 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700368 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700369
Alex Elder43ae4702012-07-03 16:01:18 -0500370 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700371 return NULL;
372
Alex Elder1f7ba332012-08-10 13:12:07 -0700373 spin_lock(&rbd_client_list_lock);
374 list_for_each_entry(client_node, &rbd_client_list, node) {
375 if (!ceph_compare_options(ceph_opts, client_node->client)) {
376 kref_get(&client_node->kref);
377 found = true;
378 break;
379 }
380 }
381 spin_unlock(&rbd_client_list_lock);
382
383 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700384}
385
386/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700387 * mount options
388 */
/*
 * Mount option tokens.  The Opt_last_* markers delimit the int-,
 * string- and Boolean-valued ranges tested in parse_rbd_opts_token().
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};
399
/* Token table for match_token(); keep in sync with the enum above */
static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};
410
/*
 * Callback invoked by ceph_parse_options() for each option it does not
 * recognize itself.  "private" is the struct rbd_options being filled
 * in.  Returns 0 on success or a negative errno for a bad option.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* The Opt_last_* markers tell us what kind of value to expect */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* Every token in rbd_opts_tokens must be handled above */
		rbd_assert(false);
		break;
	}
	return 0;
}
451
452/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700453 * Get a ceph client with specific addr and configuration, if one does
454 * not exist create it.
455 */
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  On success the client is recorded in
 * rbd_dev->rbd_client and 0 is returned; on failure a negative errno.
 */
static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
				size_t mon_addr_len, char *options)
{
	struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
	struct ceph_options *ceph_opts;
	struct rbd_client *rbdc;

	/* parse_rbd_opts_token() fills in rbd_opts as a side effect */
	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	ceph_opts = ceph_parse_options(options, mon_addr,
					mon_addr + mon_addr_len,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(ceph_opts))
		return PTR_ERR(ceph_opts);

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		/* using an existing client; rbd_client_find() took a ref */
		ceph_destroy_options(ceph_opts);
	} else {
		/* rbd_client_create() consumes ceph_opts, even on failure */
		rbdc = rbd_client_create(ceph_opts);
		if (IS_ERR(rbdc))
			return PTR_ERR(rbdc);
	}
	rbd_dev->rbd_client = rbdc;

	return 0;
}
484
/*
 * Destroy ceph client
 *
 * Kref release callback.  Takes rbd_client_list_lock itself to unlink
 * the client from the list, so the caller must NOT already hold it.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
502
/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_device *rbd_dev)
{
	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
	rbd_dev->rbd_client = NULL;	/* defend against use after put */
}
512
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700513/*
514 * Destroy requests collection
515 */
516static void rbd_coll_release(struct kref *kref)
517{
518 struct rbd_req_coll *coll =
519 container_of(kref, struct rbd_req_coll, kref);
520
521 dout("rbd_coll_release %p\n", coll);
522 kfree(coll);
523}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700524
Alex Eldera30b71b2012-07-10 20:30:11 -0500525static bool rbd_image_format_valid(u32 image_format)
526{
527 return image_format == 1 || image_format == 2;
528}
529
/*
 * Sanity-check an on-disk (v1) image header before translating it.
 * Returns true only if the magic text, object order and snapshot
 * counts/sizes are all plausible.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
568
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700569/*
570 * Create a new header structure, translate header format from the on-disk
571 * header.
572 */
573static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500574 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700575{
Alex Elderccece232012-07-10 20:30:10 -0500576 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500577 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500578 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500579 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700580
Alex Elder6a523252012-07-19 17:12:59 -0500581 memset(header, 0, sizeof (*header));
582
Alex Elder103a1502012-08-02 11:29:45 -0500583 snap_count = le32_to_cpu(ondisk->snap_count);
584
Alex Elder58c17b02012-08-23 23:22:06 -0500585 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
586 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500587 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700588 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500589 memcpy(header->object_prefix, ondisk->object_prefix, len);
590 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600591
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700592 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500593 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
594
Alex Elder621901d2012-08-23 23:22:06 -0500595 /* Save a copy of the snapshot names */
596
Alex Elderf785cc12012-08-23 23:22:06 -0500597 if (snap_names_len > (u64) SIZE_MAX)
598 return -EIO;
599 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700600 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500601 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500602 /*
603 * Note that rbd_dev_v1_header_read() guarantees
604 * the ondisk buffer we're working with has
605 * snap_names_len bytes beyond the end of the
606 * snapshot id array, this memcpy() is safe.
607 */
608 memcpy(header->snap_names, &ondisk->snaps[snap_count],
609 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500610
Alex Elder621901d2012-08-23 23:22:06 -0500611 /* Record each snapshot's size */
612
Alex Elderd2bb24e2012-07-26 23:37:14 -0500613 size = snap_count * sizeof (*header->snap_sizes);
614 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700615 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500616 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500617 for (i = 0; i < snap_count; i++)
618 header->snap_sizes[i] =
619 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700620 } else {
Alex Elderccece232012-07-10 20:30:10 -0500621 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700622 header->snap_names = NULL;
623 header->snap_sizes = NULL;
624 }
Alex Elder849b4262012-07-09 21:04:24 -0500625
Alex Elder34b13182012-07-13 20:35:12 -0500626 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700627 header->obj_order = ondisk->options.order;
628 header->crypt_type = ondisk->options.crypt_type;
629 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500630
Alex Elder621901d2012-08-23 23:22:06 -0500631 /* Allocate and fill in the snapshot context */
632
Alex Elderf84344f2012-08-31 17:29:51 -0500633 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500634 size = sizeof (struct ceph_snap_context);
635 size += snap_count * sizeof (header->snapc->snaps[0]);
636 header->snapc = kzalloc(size, GFP_KERNEL);
637 if (!header->snapc)
638 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700639
640 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500641 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700642 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500643 for (i = 0; i < snap_count; i++)
644 header->snapc->snaps[i] =
645 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700646
647 return 0;
648
Alex Elder6a523252012-07-19 17:12:59 -0500649out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500650 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500651 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700652 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500653 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500654 kfree(header->object_prefix);
655 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500656
Alex Elder00f1f362012-02-07 12:03:36 -0600657 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700658}
659
Alex Elder8836b992012-08-30 14:42:15 -0500660static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700661{
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700662
Alex Eldere86924a2012-07-10 20:30:11 -0500663 struct rbd_snap *snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600664
Alex Eldere86924a2012-07-10 20:30:11 -0500665 list_for_each_entry(snap, &rbd_dev->snaps, node) {
666 if (!strcmp(snap_name, snap->name)) {
667 rbd_dev->mapping.snap_id = snap->id;
668 rbd_dev->mapping.size = snap->size;
Alex Elder34b13182012-07-13 20:35:12 -0500669 rbd_dev->mapping.features = snap->features;
Alex Elder00f1f362012-02-07 12:03:36 -0600670
Alex Eldere86924a2012-07-10 20:30:11 -0500671 return 0;
Alex Elder00f1f362012-02-07 12:03:36 -0600672 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700673 }
Alex Eldere86924a2012-07-10 20:30:11 -0500674
Alex Elder00f1f362012-02-07 12:03:36 -0600675 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700676}
677
Alex Elder5ed16172012-08-29 17:11:07 -0500678static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700679{
Alex Elder78dc4472012-07-19 08:49:18 -0500680 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700681
Alex Elder4e1105a2012-08-31 17:29:52 -0500682 if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
Josh Durgincc9d7342011-11-21 18:19:13 -0800683 sizeof (RBD_SNAP_HEAD_NAME))) {
Alex Elderf84344f2012-08-31 17:29:51 -0500684 rbd_dev->mapping.snap_id = CEPH_NOSNAP;
Alex Elder99c1f082012-08-30 14:42:15 -0500685 rbd_dev->mapping.size = rbd_dev->header.image_size;
Alex Elder34b13182012-07-13 20:35:12 -0500686 rbd_dev->mapping.features = rbd_dev->header.features;
Alex Elderf84344f2012-08-31 17:29:51 -0500687 rbd_dev->mapping.snap_exists = false;
688 rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only;
Alex Eldere86924a2012-07-10 20:30:11 -0500689 ret = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700690 } else {
Alex Elder8836b992012-08-30 14:42:15 -0500691 ret = snap_by_name(rbd_dev, snap_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700692 if (ret < 0)
693 goto done;
Alex Elderf84344f2012-08-31 17:29:51 -0500694 rbd_dev->mapping.snap_exists = true;
695 rbd_dev->mapping.read_only = true;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700696 }
Alex Elder4e1105a2012-08-31 17:29:52 -0500697 rbd_dev->mapping.snap_name = snap_name;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700698done:
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700699 return ret;
700}
701
/*
 * Release everything rbd_header_from_disk() allocated, NULLing each
 * pointer so a double free is harmless.
 */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	ceph_put_snap_context(header->snapc);	/* snapc is refcounted */
	header->snapc = NULL;
}
713
Alex Elder65ccfe22012-08-09 10:33:26 -0700714static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700715{
Alex Elder65ccfe22012-08-09 10:33:26 -0700716 char *name;
717 u64 segment;
718 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700719
Alex Elder65ccfe22012-08-09 10:33:26 -0700720 name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
721 if (!name)
722 return NULL;
723 segment = offset >> rbd_dev->header.obj_order;
724 ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
725 rbd_dev->header.object_prefix, segment);
726 if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
727 pr_err("error formatting segment name for #%llu (%d)\n",
728 segment, ret);
729 kfree(name);
730 name = NULL;
731 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700732
Alex Elder65ccfe22012-08-09 10:33:26 -0700733 return name;
734}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700735
/* Byte offset of image offset "offset" within its containing segment */
static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	/* segment_size is a power of two, so this is offset % segment_size */
	return offset & (segment_size - 1);
}
742
/*
 * Length of the part of [offset, offset + length) that lies within the
 * single segment containing "offset"; never extends past the segment
 * boundary.
 */
static u64 rbd_segment_length(struct rbd_device *rbd_dev,
				u64 offset, u64 length)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	offset &= segment_size - 1;	/* offset within the segment */

	rbd_assert(length <= U64_MAX - offset);	/* offset + length must not wrap */
	if (offset + length > segment_size)
		length = segment_size - offset;

	return length;
}
756
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700757static int rbd_get_num_segments(struct rbd_image_header *header,
758 u64 ofs, u64 len)
759{
Alex Elderdf111be2012-08-09 10:33:26 -0700760 u64 start_seg;
761 u64 end_seg;
762
763 if (!len)
764 return 0;
765 if (len - 1 > U64_MAX - ofs)
766 return -ERANGE;
767
768 start_seg = ofs >> header->obj_order;
769 end_seg = (ofs + len - 1) >> header->obj_order;
770
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700771 return end_seg - start_seg + 1;
772}
773
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700774/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700775 * returns the size of an object in the image
776 */
777static u64 rbd_obj_bytes(struct rbd_image_header *header)
778{
779 return 1 << header->obj_order;
780}
781
782/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700783 * bio helpers
784 */
785
786static void bio_chain_put(struct bio *chain)
787{
788 struct bio *tmp;
789
790 while (chain) {
791 tmp = chain;
792 chain = chain->bi_next;
793 bio_put(tmp);
794 }
795}
796
/*
 * zeros a bio chain, starting at specific offset
 *
 * All data at and beyond byte offset @start_ofs (measured from the
 * start of the whole chain) is overwritten with zeroes.  Used to
 * clean up short reads and reads of nonexistent objects.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* bytes of the chain walked so far */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			/* only segments that extend past start_ofs */
			if (pos + bv->bv_len > start_ofs) {
				/*
				 * In the first affected segment only the
				 * tail from start_ofs on is zeroed; all
				 * later segments get remainder == 0 and
				 * are zeroed entirely.
				 */
				int remainder = max(start_ofs - pos, 0);
				/* bvec pages may be highmem: map with
				 * interrupts disabled */
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
823
/*
 * bio_chain_clone - clone a chain of bios up to a certain length.
 * might return a bio_pair that will need to be released.
 *
 * Clones bios from *@old until exactly @len bytes are covered.  On
 * return *@old points at the first unconsumed original bio and *@next
 * at where the caller should resume (either the remainder of a split
 * bio, or the next original bio).  A previously returned *@bp is
 * released on entry; a new one may be installed if a split occurs.
 * Returns the head of the cloned chain, or NULL on allocation error
 * (in which case any partial clone chain is dropped).
 */
static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
				   struct bio_pair **bp,
				   int len, gfp_t gfpmask)
{
	struct bio *old_chain = *old;
	struct bio *new_chain = NULL;
	struct bio *tail;
	int total = 0;

	if (*bp) {
		bio_pair_release(*bp);
		*bp = NULL;
	}

	while (old_chain && (total < len)) {
		struct bio *tmp;

		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
		if (!tmp)
			goto err_out;
		gfpmask &= ~__GFP_WAIT;	/* can't wait after the first */

		if (total + old_chain->bi_size > len) {
			struct bio_pair *bp;

			/*
			 * this split can only happen with a single paged bio,
			 * split_bio will BUG_ON if this is not the case
			 */
			/*
			 * NOTE(review): the two format-string halves below
			 * concatenate without a space ("...remaining=%dbi_size"),
			 * garbling the debug output slightly.
			 */
			dout("bio_chain_clone split! total=%d remaining=%d"
			     "bi_size=%u\n",
			     total, len - total, old_chain->bi_size);

			/* split the bio. We'll release it either in the next
			   call, or it will have to be released outside */
			bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
			if (!bp)
				goto err_out;

			__bio_clone(tmp, &bp->bio1);

			/* caller resumes at the second half of the split */
			*next = &bp->bio2;
		} else {
			__bio_clone(tmp, old_chain);
			*next = old_chain->bi_next;
		}

		tmp->bi_bdev = NULL;
		tmp->bi_next = NULL;
		/* append the clone to the new chain */
		if (new_chain)
			tail->bi_next = tmp;
		else
			new_chain = tmp;
		tail = tmp;
		old_chain = old_chain->bi_next;

		total += tmp->bi_size;
	}

	rbd_assert(total == len);

	*old = old_chain;

	return new_chain;

err_out:
	dout("bio_chain_clone with err\n");
	bio_chain_put(new_chain);
	return NULL;
}
898
899/*
900 * helpers for osd request op vectors.
901 */
Alex Elder57cfc102012-06-26 12:57:03 -0700902static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
903 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700904{
Alex Elder57cfc102012-06-26 12:57:03 -0700905 struct ceph_osd_req_op *ops;
906
907 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
908 if (!ops)
909 return NULL;
910
911 ops[0].op = opcode;
912
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700913 /*
914 * op extent offset and length will be set later on
915 * in calc_raw_layout()
916 */
Alex Elder57cfc102012-06-26 12:57:03 -0700917 ops[0].payload_len = payload_len;
918
919 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700920}
921
/* Release an op vector allocated by rbd_create_rw_ops(). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
926
/*
 * Record completion of the per-segment I/O at @index within @coll and
 * complete, in order, every request segment that is now finished.
 *
 * With no collection the whole block request completes directly.
 * Otherwise the segment's status is stored and all contiguous done
 * segments starting at coll->num_done are handed back to the block
 * layer under the queue lock; each completed segment drops one
 * reference on the collection.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	if (!coll) {
		/* no collection: complete the request directly */
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	/* find how far the done prefix now extends */
	max = min = coll->num_done;
	while (max < coll->total && coll->status[max].done)
		max++;

	/* complete the newly finished prefix, in segment order */
	for (i = min; i<max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
964
/* Complete the collection slot this rbd_request was issued for. */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
970
/*
 * Send ceph osd request
 *
 * Build and submit one OSD request against @object_name covering
 * [ofs, ofs + len).  The data payload is carried either by @bio or by
 * the @pages vector.  If @rbd_cb is NULL the call is synchronous: the
 * reply is waited for and the request reference dropped here;
 * otherwise the callback is responsible for cleanup.  When
 * @linger_req is non-NULL the request is marked lingering (used for
 * watch registration) and returned through it.  On error before
 * submission the collection slot (if any) is completed with the
 * error.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *ops,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *req,
					 struct ceph_msg *msg),
			  struct ceph_osd_request **linger_req,
			  u64 *ver)
{
	struct ceph_osd_request *req;
	struct ceph_file_layout *layout;
	int ret;
	u64 bno;
	struct timespec mtime = CURRENT_TIME;
	struct rbd_request *req_data;
	struct ceph_osd_request_head *reqhead;
	struct ceph_osd_client *osdc;

	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
	if (!req_data) {
		if (coll)
			rbd_coll_end_req_index(rq, coll, coll_index,
					       -ENOMEM, len);
		return -ENOMEM;
	}

	if (coll) {
		req_data->coll = coll;
		req_data->coll_index = coll_index;
	}

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
	     (unsigned long long) ofs, (unsigned long long) len);

	osdc = &rbd_dev->rbd_client->client->osdc;
	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
				      false, GFP_NOIO, pages, bio);
	if (!req) {
		ret = -ENOMEM;
		goto done_pages;
	}

	req->r_callback = rbd_cb;

	req_data->rq = rq;
	req_data->bio = bio;
	req_data->pages = pages;
	req_data->len = len;

	req->r_priv = req_data;

	reqhead = req->r_request->front.iov_base;
	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);

	/*
	 * NOTE(review): strncpy() does not NUL-terminate when
	 * object_name is as long as r_oid; the strlen() below would
	 * then read past the buffer.  Presumably names are always
	 * shorter -- confirm against RBD_MAX_SEG_NAME_LEN.
	 */
	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
	req->r_oid_len = strlen(req->r_oid);

	/* one object per stripe unit: no real striping */
	layout = &req->r_file_layout;
	memset(layout, 0, sizeof(*layout));
	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_stripe_count = cpu_to_le32(1);
	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
	ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
				   req, ops);
	rbd_assert(ret == 0);

	ceph_osdc_build_request(req, ofs, &len,
				ops,
				snapc,
				&mtime,
				req->r_oid, req->r_oid_len);

	if (linger_req) {
		ceph_osdc_set_request_linger(osdc, req);
		*linger_req = req;
	}

	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* synchronous: wait here and drop the request ref */
		ret = ceph_osdc_wait_request(osdc, req);
		if (ver)
			*ver = le64_to_cpu(req->r_reassert_version.version);
		dout("reassert_ver=%llu\n",
		     (unsigned long long)
		     le64_to_cpu(req->r_reassert_version.version));
		ceph_osdc_put_request(req);
	}
	return ret;

done_err:
	bio_chain_put(req_data->bio);
	ceph_osdc_put_request(req);
done_pages:
	rbd_coll_end_req(req_data, ret, len);
	kfree(req_data);
	return ret;
}
1083
/*
 * Ceph osd op callback
 *
 * Completion handler for async requests issued via rbd_do_op().
 * Short reads and reads of nonexistent objects are normal for sparse
 * images: the unread tail (or the whole extent) is zero filled and
 * the read reported as fully successful.  Frees the per-request state
 * and drops the bio chain and osd request references.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
	     (unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* object doesn't exist: read as all zeroes */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero the rest and report full length */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1123
/* Minimal completion callback: just drop the osd request reference. */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1128
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a page vector large enough for @inbound_size bytes,
 * submits the request without a callback (so rbd_do_request() waits
 * for the reply), and for read-flagged operations copies the result
 * back into @inbound.  Returns the (possibly updated) byte count from
 * the operation, or a negative errno.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int flags,
			   struct ceph_osd_req_op *ops,
			   const char *object_name,
			   u64 ofs, u64 inbound_size,
			   char *inbound,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(ops != NULL);

	num_pages = calc_pages_for(ofs, inbound_size);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* NULL callback makes rbd_do_request() synchronous */
	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			     object_name, ofs, inbound_size, NULL,
			     pages, num_pages,
			     flags,
			     ops,
			     NULL, 0,
			     NULL,
			     linger_req, ver);
	if (ret < 0)
		goto done;

	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1172
/*
 * Do an asynchronous ceph osd operation
 *
 * Issue the read or write described by @rq for the single segment
 * starting at image offset @ofs (length @len must not cross a segment
 * boundary; see the assertion below).  Direction, flags, snapshot id
 * and snap context are derived from the request: writes go to head
 * with the supplied snap context, reads target the mapped snapshot.
 * Completion is reported through @coll at @coll_index via
 * rbd_req_cb().
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;
	int opcode;
	int flags;
	u64 snapid;

	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	if (rq_data_dir(rq) == WRITE) {
		opcode = CEPH_OSD_OP_WRITE;
		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
		snapid = CEPH_NOSNAP;
		payload_len = seg_len;
	} else {
		opcode = CEPH_OSD_OP_READ;
		flags = CEPH_OSD_FLAG_READ;
		/* reads carry no snap context, just the snapshot id */
		snapc = NULL;
		snapid = rbd_dev->mapping.snap_id;
		payload_len = 0;
	}

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1237
/*
 * Request sync osd read
 *
 * Synchronously read [ofs, ofs+len) of @object_name at snapshot
 * @snapid into @buf, optionally returning the object version in @ver.
 * Returns bytes read or a negative errno.
 */
static int rbd_req_sync_read(struct rbd_device *rbd_dev,
			     u64 snapid,
			     const char *object_name,
			     u64 ofs, u64 len,
			     char *buf,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
	if (!ops)
		return -ENOMEM;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      snapid,
			      CEPH_OSD_FLAG_READ,
			      ops, object_name, ofs, len, buf, NULL, ver);
	rbd_destroy_ops(ops);

	return ret;
}
1263
/*
 * Acknowledge a watch notification on the header object.
 *
 * Sent asynchronously (rbd_simple_req_cb just drops the request ref);
 * the OSD uses the ack to know the client processed notify_id and
 * need not be timed out.
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = cpu_to_le64(ver);
	/* NOTE(review): cookie is not byte-swapped here while ver is;
	 * confirm the expected endianness of watch.cookie */
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			     rbd_dev->header_name, 0, 0, NULL,
			     NULL, 0,
			     CEPH_OSD_FLAG_READ,
			     ops,
			     NULL, 0,
			     rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1293
/*
 * Callback invoked when the watched header object changes.  Refreshes
 * the device's view of the image and acknowledges the notification.
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	u64 hver;
	int rc;

	if (!rbd_dev)
		return;

	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
	     rbd_dev->header_name, (unsigned long long) notify_id,
	     (unsigned int) opcode);
	rc = rbd_dev_refresh(rbd_dev, &hver);
	if (rc)
		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
			   " update snaps: %d\n", rbd_dev->major, rc);

	/*
	 * NOTE(review): if rbd_dev_refresh() failed, hver may be
	 * uninitialized here -- confirm it is always set, or ack
	 * with a known-safe version on failure.
	 */
	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
}
1313
/*
 * Request sync osd watch
 *
 * Register a watch on the image's header object so rbd_watch_cb()
 * fires on header updates.  Creates the osd event first, then issues
 * a lingering WATCH op (flag = 1 means "register").  On failure the
 * event is torn down; rbd_dev->watch_event/watch_request record the
 * live registration for rbd_req_sync_unwatch().
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* register, not unregister */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1357
/*
 * Request sync osd unwatch
 *
 * Unregister the header-object watch set up by rbd_req_sync_watch()
 * (WATCH op with flag = 0) and cancel the associated osd event.  The
 * event is canceled even if the unwatch op itself failed.
 */
static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 0;	/* unregister */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL, NULL, NULL);


	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	return ret;
}
1387
/*
 * Synchronous osd object method call
 *
 * Invoke @class_name.@method_name on @object_name, sending
 * @outbound_size bytes of input and receiving up to @inbound_size
 * bytes of output into @inbound.  Returns the op result (bytes for
 * reads) or a negative errno; @ver optionally receives the object
 * version.
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     int flags,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int class_name_len = strlen(class_name);
	int method_name_len = strlen(method_name);
	int payload_size;
	int ret;

	/*
	 * Any input parameters required by the method we're calling
	 * will be sent along with the class and method names as
	 * part of the message payload.  That data and its size are
	 * supplied via the indata and indata_len fields (named from
	 * the perspective of the server side) in the OSD request
	 * operation.
	 */
	payload_size = class_name_len + method_name_len + outbound_size;
	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
	if (!ops)
		return -ENOMEM;

	ops[0].cls.class_name = class_name;
	ops[0].cls.class_len = (__u8) class_name_len;
	ops[0].cls.method_name = method_name;
	ops[0].cls.method_len = (__u8) method_name_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = outbound;
	ops[0].cls.indata_len = outbound_size;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      flags, ops,
			      object_name, 0, inbound_size, inbound,
			      NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1440
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001441static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1442{
1443 struct rbd_req_coll *coll =
1444 kzalloc(sizeof(struct rbd_req_coll) +
1445 sizeof(struct rbd_req_status) * num_reqs,
1446 GFP_ATOMIC);
1447
1448 if (!coll)
1449 return NULL;
1450 coll->total = num_reqs;
1451 kref_init(&coll->kref);
1452 return coll;
1453}
1454
/*
 * block device queue callback
 *
 * Pull requests off the queue, split each into per-segment I/Os and
 * submit them asynchronously.  The queue lock is held on entry to
 * each loop iteration (blk_fetch_request() requires it); it is
 * dropped around submission and reacquired before completing errors
 * or fetching the next request.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;
	struct bio_pair *bp = NULL;

	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		struct bio *rq_bio, *next_bio = NULL;
		bool do_write;
		unsigned int size;
		u64 op_size = 0;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		rq_bio = rq->bio;
		if (do_write && rbd_dev->mapping.read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		/* drop the queue lock across submission */
		spin_unlock_irq(q->queue_lock);

		/* header_rwsem guards the mapping and snap context */
		down_read(&rbd_dev->header_rwsem);

		if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
		    !rbd_dev->mapping.snap_exists) {
			up_read(&rbd_dev->header_rwsem);
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		/* pin the snap context for the duration of this request */
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		if (num_segs <= 0) {
			/* num_segs is 0 or a negative errno here */
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, num_segs);
			ceph_put_snap_context(snapc);
			continue;
		}
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		do {
			/* a bio clone to be passed down to OSD req */
			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
			op_size = rbd_segment_length(rbd_dev, ofs, size);
			/* one collection ref per in-flight segment */
			kref_get(&coll->kref);
			/* bio_chain_clone() releases any prior *bp and
			 * may install a new pair on a segment split */
			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
					      op_size, GFP_ATOMIC);
			if (bio)
				(void) rbd_do_op(rq, rbd_dev, snapc,
						 ofs, op_size,
						 bio, coll, cur_seg);
			else
				/* clone failed: fail just this segment */
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, op_size);
			size -= op_size;
			ofs += op_size;

			cur_seg++;
			rq_bio = next_bio;
		} while (size > 0);
		/* drop the allocation's own reference */
		kref_put(&coll->kref, rbd_coll_release);

		if (bp)
			bio_pair_release(bp);
		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1559
1560/*
1561 * a queue callback. Makes sure that we don't create a bio that spans across
1562 * multiple osd objects. One exception would be with a single page bios,
1563 * which we handle later at bio_chain_clone
1564 */
1565static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1566 struct bio_vec *bvec)
1567{
1568 struct rbd_device *rbd_dev = q->queuedata;
Alex Elder593a9e72012-02-07 12:03:37 -06001569 unsigned int chunk_sectors;
1570 sector_t sector;
1571 unsigned int bio_sectors;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001572 int max;
1573
Alex Elder593a9e72012-02-07 12:03:37 -06001574 chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1575 sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1576 bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1577
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001578 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
Alex Elder593a9e72012-02-07 12:03:37 -06001579 + bio_sectors)) << SECTOR_SHIFT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001580 if (max < 0)
1581 max = 0; /* bio_add cannot handle a negative return */
1582 if (max <= bvec->bv_len && bio_sectors == 0)
1583 return bvec->bv_len;
1584 return max;
1585}
1586
1587static void rbd_free_disk(struct rbd_device *rbd_dev)
1588{
1589 struct gendisk *disk = rbd_dev->disk;
1590
1591 if (!disk)
1592 return;
1593
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001594 if (disk->flags & GENHD_FL_UP)
1595 del_gendisk(disk);
1596 if (disk->queue)
1597 blk_cleanup_queue(disk->queue);
1598 put_disk(disk);
1599}
1600
1601/*
Alex Elder4156d992012-08-02 11:29:46 -05001602 * Read the complete header for the given rbd device.
1603 *
1604 * Returns a pointer to a dynamically-allocated buffer containing
1605 * the complete and validated header. Caller can pass the address
1606 * of a variable that will be filled in with the version of the
1607 * header object at the time it was read.
1608 *
1609 * Returns a pointer-coded errno if a failure occurs.
1610 */
1611static struct rbd_image_header_ondisk *
1612rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
1613{
1614 struct rbd_image_header_ondisk *ondisk = NULL;
1615 u32 snap_count = 0;
1616 u64 names_size = 0;
1617 u32 want_count;
1618 int ret;
1619
1620 /*
1621 * The complete header will include an array of its 64-bit
1622 * snapshot ids, followed by the names of those snapshots as
1623 * a contiguous block of NUL-terminated strings. Note that
1624 * the number of snapshots could change by the time we read
1625 * it in, in which case we re-read it.
1626 */
1627 do {
1628 size_t size;
1629
1630 kfree(ondisk);
1631
1632 size = sizeof (*ondisk);
1633 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1634 size += names_size;
1635 ondisk = kmalloc(size, GFP_KERNEL);
1636 if (!ondisk)
1637 return ERR_PTR(-ENOMEM);
1638
1639 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
1640 rbd_dev->header_name,
1641 0, size,
1642 (char *) ondisk, version);
1643
1644 if (ret < 0)
1645 goto out_err;
1646 if (WARN_ON((size_t) ret < size)) {
1647 ret = -ENXIO;
1648 pr_warning("short header read for image %s"
1649 " (want %zd got %d)\n",
1650 rbd_dev->image_name, size, ret);
1651 goto out_err;
1652 }
1653 if (!rbd_dev_ondisk_valid(ondisk)) {
1654 ret = -ENXIO;
1655 pr_warning("invalid header for image %s\n",
1656 rbd_dev->image_name);
1657 goto out_err;
1658 }
1659
1660 names_size = le64_to_cpu(ondisk->snap_names_len);
1661 want_count = snap_count;
1662 snap_count = le32_to_cpu(ondisk->snap_count);
1663 } while (snap_count != want_count);
1664
1665 return ondisk;
1666
1667out_err:
1668 kfree(ondisk);
1669
1670 return ERR_PTR(ret);
1671}
1672
1673/*
1674 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001675 */
1676static int rbd_read_header(struct rbd_device *rbd_dev,
1677 struct rbd_image_header *header)
1678{
Alex Elder4156d992012-08-02 11:29:46 -05001679 struct rbd_image_header_ondisk *ondisk;
1680 u64 ver = 0;
1681 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001682
Alex Elder4156d992012-08-02 11:29:46 -05001683 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1684 if (IS_ERR(ondisk))
1685 return PTR_ERR(ondisk);
1686 ret = rbd_header_from_disk(header, ondisk);
1687 if (ret >= 0)
1688 header->obj_version = ver;
1689 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001690
Alex Elder4156d992012-08-02 11:29:46 -05001691 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001692}
1693
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001694static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1695{
1696 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001697 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001698
Alex Eldera0593292012-07-19 09:09:27 -05001699 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder14e70852012-07-19 09:09:27 -05001700 __rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001701}
1702
Alex Elder94785542012-10-09 13:50:17 -07001703static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1704{
1705 sector_t size;
1706
1707 if (rbd_dev->mapping.snap_id != CEPH_NOSNAP)
1708 return;
1709
1710 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1711 dout("setting size to %llu sectors", (unsigned long long) size);
1712 rbd_dev->mapping.size = (u64) size;
1713 set_capacity(rbd_dev->disk, size);
1714}
1715
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001716/*
1717 * only read the first part of the ondisk header, without the snaps info
1718 */
Alex Elder117973f2012-08-31 17:29:55 -05001719static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001720{
1721 int ret;
1722 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001723
1724 ret = rbd_read_header(rbd_dev, &h);
1725 if (ret < 0)
1726 return ret;
1727
Josh Durgina51aa0c2011-12-05 10:35:04 -08001728 down_write(&rbd_dev->header_rwsem);
1729
Alex Elder94785542012-10-09 13:50:17 -07001730 /* Update image size, and check for resize of mapped image */
1731 rbd_dev->header.image_size = h.image_size;
1732 rbd_update_mapping_size(rbd_dev);
Sage Weil9db4b3e2011-04-19 22:49:06 -07001733
Alex Elder849b4262012-07-09 21:04:24 -05001734 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001735 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05001736 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08001737 /* osd requests may still refer to snapc */
1738 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001739
Alex Elderb8136232012-07-25 09:32:41 -05001740 if (hver)
1741 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08001742 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08001743 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001744 rbd_dev->header.snapc = h.snapc;
1745 rbd_dev->header.snap_names = h.snap_names;
1746 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05001747 /* Free the extra copy of the object prefix */
1748 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1749 kfree(h.object_prefix);
1750
Alex Elder304f6802012-08-31 17:29:52 -05001751 ret = rbd_dev_snaps_update(rbd_dev);
1752 if (!ret)
1753 ret = rbd_dev_snaps_register(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001754
Josh Durginc6666012011-11-21 17:11:12 -08001755 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001756
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001757 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001758}
1759
Alex Elder117973f2012-08-31 17:29:55 -05001760static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
Alex Elder1fe5e992012-07-25 09:32:41 -05001761{
1762 int ret;
1763
Alex Elder117973f2012-08-31 17:29:55 -05001764 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder1fe5e992012-07-25 09:32:41 -05001765 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05001766 if (rbd_dev->image_format == 1)
1767 ret = rbd_dev_v1_refresh(rbd_dev, hver);
1768 else
1769 ret = rbd_dev_v2_refresh(rbd_dev, hver);
Alex Elder1fe5e992012-07-25 09:32:41 -05001770 mutex_unlock(&ctl_mutex);
1771
1772 return ret;
1773}
1774
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001775static int rbd_init_disk(struct rbd_device *rbd_dev)
1776{
1777 struct gendisk *disk;
1778 struct request_queue *q;
Alex Elder593a9e72012-02-07 12:03:37 -06001779 u64 segment_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001780
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001781 /* create gendisk info */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001782 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1783 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05001784 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001785
Alex Elderf0f8cef2012-01-29 13:57:44 -06001786 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05001787 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001788 disk->major = rbd_dev->major;
1789 disk->first_minor = 0;
1790 disk->fops = &rbd_bd_ops;
1791 disk->private_data = rbd_dev;
1792
1793 /* init rq */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001794 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1795 if (!q)
1796 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07001797
Alex Elder593a9e72012-02-07 12:03:37 -06001798 /* We use the default size, but let's be explicit about it. */
1799 blk_queue_physical_block_size(q, SECTOR_SIZE);
1800
Josh Durgin029bcbd2011-07-22 11:35:23 -07001801 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06001802 segment_size = rbd_obj_bytes(&rbd_dev->header);
1803 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1804 blk_queue_max_segment_size(q, segment_size);
1805 blk_queue_io_min(q, segment_size);
1806 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07001807
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001808 blk_queue_merge_bvec(q, rbd_merge_bvec);
1809 disk->queue = q;
1810
1811 q->queuedata = rbd_dev;
1812
1813 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001814
Alex Elder12f02942012-08-29 17:11:07 -05001815 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
1816
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001817 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001818out_disk:
1819 put_disk(disk);
Alex Elder1fcdb8a2012-08-29 17:11:06 -05001820
1821 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001822}
1823
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001824/*
1825 sysfs
1826*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001827
/* Map a sysfs struct device back to its containing rbd_device */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1832
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001833static ssize_t rbd_size_show(struct device *dev,
1834 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001835{
Alex Elder593a9e72012-02-07 12:03:37 -06001836 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001837 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001838
Josh Durgina51aa0c2011-12-05 10:35:04 -08001839 down_read(&rbd_dev->header_rwsem);
1840 size = get_capacity(rbd_dev->disk);
1841 up_read(&rbd_dev->header_rwsem);
1842
1843 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001844}
1845
Alex Elder34b13182012-07-13 20:35:12 -05001846/*
1847 * Note this shows the features for whatever's mapped, which is not
1848 * necessarily the base image.
1849 */
1850static ssize_t rbd_features_show(struct device *dev,
1851 struct device_attribute *attr, char *buf)
1852{
1853 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1854
1855 return sprintf(buf, "0x%016llx\n",
1856 (unsigned long long) rbd_dev->mapping.features);
1857}
1858
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001859static ssize_t rbd_major_show(struct device *dev,
1860 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001861{
Alex Elder593a9e72012-02-07 12:03:37 -06001862 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001863
1864 return sprintf(buf, "%d\n", rbd_dev->major);
1865}
1866
1867static ssize_t rbd_client_id_show(struct device *dev,
1868 struct device_attribute *attr, char *buf)
1869{
Alex Elder593a9e72012-02-07 12:03:37 -06001870 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001871
Alex Elder1dbb4392012-01-24 10:08:37 -06001872 return sprintf(buf, "client%lld\n",
1873 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001874}
1875
1876static ssize_t rbd_pool_show(struct device *dev,
1877 struct device_attribute *attr, char *buf)
1878{
Alex Elder593a9e72012-02-07 12:03:37 -06001879 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001880
1881 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1882}
1883
Alex Elder9bb2f332012-07-12 10:46:35 -05001884static ssize_t rbd_pool_id_show(struct device *dev,
1885 struct device_attribute *attr, char *buf)
1886{
1887 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1888
1889 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1890}
1891
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001892static ssize_t rbd_name_show(struct device *dev,
1893 struct device_attribute *attr, char *buf)
1894{
Alex Elder593a9e72012-02-07 12:03:37 -06001895 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001896
Alex Elder0bed54d2012-07-03 16:01:18 -05001897 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001898}
1899
Alex Elder589d30e2012-07-10 20:30:11 -05001900static ssize_t rbd_image_id_show(struct device *dev,
1901 struct device_attribute *attr, char *buf)
1902{
1903 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1904
1905 return sprintf(buf, "%s\n", rbd_dev->image_id);
1906}
1907
Alex Elder34b13182012-07-13 20:35:12 -05001908/*
1909 * Shows the name of the currently-mapped snapshot (or
1910 * RBD_SNAP_HEAD_NAME for the base image).
1911 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001912static ssize_t rbd_snap_show(struct device *dev,
1913 struct device_attribute *attr,
1914 char *buf)
1915{
Alex Elder593a9e72012-02-07 12:03:37 -06001916 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001917
Alex Elderf84344f2012-08-31 17:29:51 -05001918 return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001919}
1920
1921static ssize_t rbd_image_refresh(struct device *dev,
1922 struct device_attribute *attr,
1923 const char *buf,
1924 size_t size)
1925{
Alex Elder593a9e72012-02-07 12:03:37 -06001926 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05001927 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001928
Alex Elder117973f2012-08-31 17:29:55 -05001929 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05001930
1931 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001932}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001933
/* Attributes exposed for each mapped rbd device */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
/* "refresh" is write-only: writing it triggers a header re-read */
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* The rbd_device's lifetime is managed elsewhere; nothing to do */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
1977
1978
1979/*
1980 sysfs - snapshots
1981*/
1982
1983static ssize_t rbd_snap_size_show(struct device *dev,
1984 struct device_attribute *attr,
1985 char *buf)
1986{
1987 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1988
Josh Durgin35915382011-12-05 18:25:13 -08001989 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001990}
1991
1992static ssize_t rbd_snap_id_show(struct device *dev,
1993 struct device_attribute *attr,
1994 char *buf)
1995{
1996 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1997
Josh Durgin35915382011-12-05 18:25:13 -08001998 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001999}
2000
Alex Elder34b13182012-07-13 20:35:12 -05002001static ssize_t rbd_snap_features_show(struct device *dev,
2002 struct device_attribute *attr,
2003 char *buf)
2004{
2005 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2006
2007 return sprintf(buf, "0x%016llx\n",
2008 (unsigned long long) snap->features);
2009}
2010
/* Attributes exposed for each snapshot, under its parent rbd device */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* device release callback: frees the rbd_snap and its name */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups = rbd_snap_attr_groups,
	.release = rbd_snap_dev_release,
};
2042
Alex Elder304f6802012-08-31 17:29:52 -05002043static bool rbd_snap_registered(struct rbd_snap *snap)
2044{
2045 bool ret = snap->dev.type == &rbd_snap_device_type;
2046 bool reg = device_is_registered(&snap->dev);
2047
2048 rbd_assert(!ret ^ reg);
2049
2050 return ret;
2051}
2052
Alex Elder14e70852012-07-19 09:09:27 -05002053static void __rbd_remove_snap_dev(struct rbd_snap *snap)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002054{
2055 list_del(&snap->node);
Alex Elder304f6802012-08-31 17:29:52 -05002056 if (device_is_registered(&snap->dev))
2057 device_unregister(&snap->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002058}
2059
Alex Elder14e70852012-07-19 09:09:27 -05002060static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002061 struct device *parent)
2062{
2063 struct device *dev = &snap->dev;
2064 int ret;
2065
2066 dev->type = &rbd_snap_device_type;
2067 dev->parent = parent;
2068 dev->release = rbd_snap_dev_release;
Alex Elderd4b125e2012-07-03 16:01:19 -05002069 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
Alex Elder304f6802012-08-31 17:29:52 -05002070 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2071
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002072 ret = device_register(dev);
2073
2074 return ret;
2075}
2076
Alex Elder4e891e02012-07-10 20:30:10 -05002077static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05002078 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05002079 u64 snap_id, u64 snap_size,
2080 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002081{
Alex Elder4e891e02012-07-10 20:30:10 -05002082 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002083 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002084
2085 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002086 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002087 return ERR_PTR(-ENOMEM);
2088
2089 ret = -ENOMEM;
Alex Elderc8d18422012-07-10 20:30:11 -05002090 snap->name = kstrdup(snap_name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002091 if (!snap->name)
2092 goto err;
2093
Alex Elderc8d18422012-07-10 20:30:11 -05002094 snap->id = snap_id;
2095 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05002096 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05002097
2098 return snap;
2099
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002100err:
2101 kfree(snap->name);
2102 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002103
2104 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002105}
2106
Alex Eldercd892122012-07-03 16:01:19 -05002107static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2108 u64 *snap_size, u64 *snap_features)
2109{
2110 char *snap_name;
2111
2112 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2113
2114 *snap_size = rbd_dev->header.snap_sizes[which];
2115 *snap_features = 0; /* No features for v1 */
2116
2117 /* Skip over names until we find the one we are looking for */
2118
2119 snap_name = rbd_dev->header.snap_names;
2120 while (which--)
2121 snap_name += strlen(snap_name) + 1;
2122
2123 return snap_name;
2124}
2125
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	/* Invoke the rbd class "get_size" method on the header object */
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* order is a raw byte; size arrives little-endian */
	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2159
/* Fetch the current size and object order of the base image */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
2166
/*
 * Fetch the object name prefix of a format 2 image via the rbd class
 * "get_object_prefix" method and store the (allocated) result in
 * rbd_dev->header.object_prefix.  Returns 0 or a negative errno.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	/* The reply holds a single length-prefixed string */
	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
2204
/*
 * Get the feature bits for the given snapshot, or for the base image
 * if snap_id is CEPH_NOSNAP, via the rbd class "get_features" method.
 * Fails with -ENOTSUPP if the image requires incompatible features
 * outside RBD_FEATURES_ALL.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	struct {
		__le64 features;
		__le64 incompat;	/* features required to use the image */
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Refuse images needing incompatible features we don't implement */
	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_ALL)
		return -ENOTSUPP;

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2238
/* Fetch the feature bits of the base image */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}
2244
Alex Elder6e14b1a2012-07-03 16:01:19 -05002245static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05002246{
2247 size_t size;
2248 int ret;
2249 void *reply_buf;
2250 void *p;
2251 void *end;
2252 u64 seq;
2253 u32 snap_count;
2254 struct ceph_snap_context *snapc;
2255 u32 i;
2256
2257 /*
2258 * We'll need room for the seq value (maximum snapshot id),
2259 * snapshot count, and array of that many snapshot ids.
2260 * For now we have a fixed upper limit on the number we're
2261 * prepared to receive.
2262 */
2263 size = sizeof (__le64) + sizeof (__le32) +
2264 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2265 reply_buf = kzalloc(size, GFP_KERNEL);
2266 if (!reply_buf)
2267 return -ENOMEM;
2268
2269 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2270 "rbd", "get_snapcontext",
2271 NULL, 0,
2272 reply_buf, size,
Alex Elder6e14b1a2012-07-03 16:01:19 -05002273 CEPH_OSD_FLAG_READ, ver);
Alex Elder35d489f2012-07-03 16:01:19 -05002274 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2275 if (ret < 0)
2276 goto out;
2277
2278 ret = -ERANGE;
2279 p = reply_buf;
2280 end = (char *) reply_buf + size;
2281 ceph_decode_64_safe(&p, end, seq, out);
2282 ceph_decode_32_safe(&p, end, snap_count, out);
2283
2284 /*
2285 * Make sure the reported number of snapshot ids wouldn't go
2286 * beyond the end of our buffer. But before checking that,
2287 * make sure the computed size of the snapshot context we
2288 * allocate is representable in a size_t.
2289 */
2290 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2291 / sizeof (u64)) {
2292 ret = -EINVAL;
2293 goto out;
2294 }
2295 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2296 goto out;
2297
2298 size = sizeof (struct ceph_snap_context) +
2299 snap_count * sizeof (snapc->snaps[0]);
2300 snapc = kmalloc(size, GFP_KERNEL);
2301 if (!snapc) {
2302 ret = -ENOMEM;
2303 goto out;
2304 }
2305
2306 atomic_set(&snapc->nref, 1);
2307 snapc->seq = seq;
2308 snapc->num_snaps = snap_count;
2309 for (i = 0; i < snap_count; i++)
2310 snapc->snaps[i] = ceph_decode_64(&p);
2311
2312 rbd_dev->header.snapc = snapc;
2313
2314 dout(" snap context seq = %llu, snap_count = %u\n",
2315 (unsigned long long) seq, (unsigned int) snap_count);
2316
2317out:
2318 kfree(reply_buf);
2319
2320 return 0;
2321}
2322
/*
 * Look up the name of snapshot @which (an index into the current
 * snapshot context) via the rbd class "get_snapshot_name" method.
 * Returns an allocated string the caller must free, or an ERR_PTR.
 */
static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	size_t size;
	void *reply_buf;
	__le64 snap_id;
	int ret;
	void *p;
	void *end;
	size_t snap_name_len;
	char *snap_name;

	/* Room for a length-prefixed string of bounded length */
	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_snapshot_name",
				(char *) &snap_id, sizeof (snap_id),
				reply_buf, size,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	end = (char *) reply_buf + size;
	snap_name_len = 0;
	snap_name = ceph_extract_encoded_string(&p, end, &snap_name_len,
				GFP_KERNEL);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out;
	} else {
		dout(" snap_id 0x%016llx snap_name = %s\n",
			(unsigned long long) le64_to_cpu(snap_id), snap_name);
	}
	kfree(reply_buf);

	return snap_name;
out:
	kfree(reply_buf);

	return ERR_PTR(ret);
}
2369
2370static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2371 u64 *snap_size, u64 *snap_features)
2372{
2373 __le64 snap_id;
2374 u8 order;
2375 int ret;
2376
2377 snap_id = rbd_dev->header.snapc->snaps[which];
2378 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2379 if (ret)
2380 return ERR_PTR(ret);
2381 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2382 if (ret)
2383 return ERR_PTR(ret);
2384
2385 return rbd_dev_v2_snap_name(rbd_dev, which);
2386}
2387
2388static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2389 u64 *snap_size, u64 *snap_features)
2390{
2391 if (rbd_dev->image_format == 1)
2392 return rbd_dev_v1_snap_info(rbd_dev, which,
2393 snap_size, snap_features);
2394 if (rbd_dev->image_format == 2)
2395 return rbd_dev_v2_snap_info(rbd_dev, which,
2396 snap_size, snap_features);
2397 return ERR_PTR(-EINVAL);
2398}
2399
Alex Elder117973f2012-08-31 17:29:55 -05002400static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2401{
2402 int ret;
2403 __u8 obj_order;
2404
2405 down_write(&rbd_dev->header_rwsem);
2406
2407 /* Grab old order first, to see if it changes */
2408
2409 obj_order = rbd_dev->header.obj_order,
2410 ret = rbd_dev_v2_image_size(rbd_dev);
2411 if (ret)
2412 goto out;
2413 if (rbd_dev->header.obj_order != obj_order) {
2414 ret = -EIO;
2415 goto out;
2416 }
2417 rbd_update_mapping_size(rbd_dev);
2418
2419 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2420 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2421 if (ret)
2422 goto out;
2423 ret = rbd_dev_snaps_update(rbd_dev);
2424 dout("rbd_dev_snaps_update returned %d\n", ret);
2425 if (ret)
2426 goto out;
2427 ret = rbd_dev_snaps_register(rbd_dev);
2428 dout("rbd_dev_snaps_register returned %d\n", ret);
2429out:
2430 up_write(&rbd_dev->header_rwsem);
2431
2432 return ret;
2433}
2434
Alex Elder9d475de2012-07-03 16:01:19 -05002435/*
Alex Elder35938152012-08-02 11:29:46 -05002436 * Scan the rbd device's current snapshot list and compare it to the
2437 * newly-received snapshot context. Remove any existing snapshots
2438 * not present in the new snapshot context. Add a new snapshot for
2439 * any snaphots in the snapshot context not in the current list.
2440 * And verify there are no changes to snapshots we already know
2441 * about.
2442 *
2443 * Assumes the snapshots in the snapshot context are sorted by
2444 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2445 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002446 */
Alex Elder304f6802012-08-31 17:29:52 -05002447static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002448{
Alex Elder35938152012-08-02 11:29:46 -05002449 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2450 const u32 snap_count = snapc->num_snaps;
Alex Elder35938152012-08-02 11:29:46 -05002451 struct list_head *head = &rbd_dev->snaps;
2452 struct list_head *links = head->next;
2453 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002454
Alex Elder9fcbb802012-08-23 23:48:49 -05002455 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
Alex Elder35938152012-08-02 11:29:46 -05002456 while (index < snap_count || links != head) {
2457 u64 snap_id;
2458 struct rbd_snap *snap;
Alex Eldercd892122012-07-03 16:01:19 -05002459 char *snap_name;
2460 u64 snap_size = 0;
2461 u64 snap_features = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002462
Alex Elder35938152012-08-02 11:29:46 -05002463 snap_id = index < snap_count ? snapc->snaps[index]
2464 : CEPH_NOSNAP;
2465 snap = links != head ? list_entry(links, struct rbd_snap, node)
2466 : NULL;
Alex Elderaafb2302012-09-06 16:00:54 -05002467 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002468
Alex Elder35938152012-08-02 11:29:46 -05002469 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2470 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002471
Alex Elder35938152012-08-02 11:29:46 -05002472 /* Existing snapshot not in the new snap context */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002473
Alex Elderf84344f2012-08-31 17:29:51 -05002474 if (rbd_dev->mapping.snap_id == snap->id)
2475 rbd_dev->mapping.snap_exists = false;
Alex Elder35938152012-08-02 11:29:46 -05002476 __rbd_remove_snap_dev(snap);
Alex Elder9fcbb802012-08-23 23:48:49 -05002477 dout("%ssnap id %llu has been removed\n",
Alex Elderf84344f2012-08-31 17:29:51 -05002478 rbd_dev->mapping.snap_id == snap->id ?
2479 "mapped " : "",
Alex Elder9fcbb802012-08-23 23:48:49 -05002480 (unsigned long long) snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002481
Alex Elder35938152012-08-02 11:29:46 -05002482 /* Done with this list entry; advance */
2483
2484 links = next;
2485 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002486 }
Alex Elder35938152012-08-02 11:29:46 -05002487
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002488 snap_name = rbd_dev_snap_info(rbd_dev, index,
2489 &snap_size, &snap_features);
Alex Eldercd892122012-07-03 16:01:19 -05002490 if (IS_ERR(snap_name))
2491 return PTR_ERR(snap_name);
2492
Alex Elder9fcbb802012-08-23 23:48:49 -05002493 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2494 (unsigned long long) snap_id);
Alex Elder35938152012-08-02 11:29:46 -05002495 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2496 struct rbd_snap *new_snap;
2497
2498 /* We haven't seen this snapshot before */
2499
Alex Elderc8d18422012-07-10 20:30:11 -05002500 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
Alex Eldercd892122012-07-03 16:01:19 -05002501 snap_id, snap_size, snap_features);
Alex Elder9fcbb802012-08-23 23:48:49 -05002502 if (IS_ERR(new_snap)) {
2503 int err = PTR_ERR(new_snap);
2504
2505 dout(" failed to add dev, error %d\n", err);
2506
2507 return err;
2508 }
Alex Elder35938152012-08-02 11:29:46 -05002509
2510 /* New goes before existing, or at end of list */
2511
Alex Elder9fcbb802012-08-23 23:48:49 -05002512 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05002513 if (snap)
2514 list_add_tail(&new_snap->node, &snap->node);
2515 else
Alex Elder523f3252012-08-30 00:16:37 -05002516 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05002517 } else {
2518 /* Already have this one */
2519
Alex Elder9fcbb802012-08-23 23:48:49 -05002520 dout(" already present\n");
2521
Alex Eldercd892122012-07-03 16:01:19 -05002522 rbd_assert(snap->size == snap_size);
Alex Elderaafb2302012-09-06 16:00:54 -05002523 rbd_assert(!strcmp(snap->name, snap_name));
Alex Eldercd892122012-07-03 16:01:19 -05002524 rbd_assert(snap->features == snap_features);
Alex Elder35938152012-08-02 11:29:46 -05002525
2526 /* Done with this list entry; advance */
2527
2528 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002529 }
Alex Elder35938152012-08-02 11:29:46 -05002530
2531 /* Advance to the next entry in the snapshot context */
2532
2533 index++;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002534 }
Alex Elder9fcbb802012-08-23 23:48:49 -05002535 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002536
2537 return 0;
2538}
2539
Alex Elder304f6802012-08-31 17:29:52 -05002540/*
2541 * Scan the list of snapshots and register the devices for any that
2542 * have not already been registered.
2543 */
2544static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2545{
2546 struct rbd_snap *snap;
2547 int ret = 0;
2548
2549 dout("%s called\n", __func__);
Alex Elder86ff77b2012-08-31 17:29:53 -05002550 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2551 return -EIO;
Alex Elder304f6802012-08-31 17:29:52 -05002552
2553 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2554 if (!rbd_snap_registered(snap)) {
2555 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2556 if (ret < 0)
2557 break;
2558 }
2559 }
2560 dout("%s: returning %d\n", __func__, ret);
2561
2562 return ret;
2563}
2564
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002565static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2566{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002567 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05002568 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002569
2570 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002571
Alex Eldercd789ab2012-08-30 00:16:38 -05002572 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002573 dev->bus = &rbd_bus_type;
2574 dev->type = &rbd_device_type;
2575 dev->parent = &rbd_root_dev;
2576 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05002577 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002578 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002579
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002580 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05002581
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002582 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002583}
2584
/*
 * Remove the rbd device from the bus.  Teardown of the rbd_dev
 * itself is handled by its release callback (rbd_dev_release, set in
 * rbd_bus_add_dev) once the last device reference is dropped.
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2589
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002590static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2591{
2592 int ret, rc;
2593
2594 do {
Alex Elder0e6f3222012-07-25 09:32:40 -05002595 ret = rbd_req_sync_watch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002596 if (ret == -ERANGE) {
Alex Elder117973f2012-08-31 17:29:55 -05002597 rc = rbd_dev_refresh(rbd_dev, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002598 if (rc < 0)
2599 return rc;
2600 }
2601 } while (ret == -ERANGE);
2602
2603 return ret;
2604}
2605
/* Highest rbd device id currently in use; see rbd_dev_id_get/put() */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06002607
2608/*
Alex Elder499afd52012-02-02 08:13:29 -06002609 * Get a unique rbd identifier for the given new rbd_dev, and add
2610 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06002611 */
Alex Eldere2839302012-08-29 17:11:06 -05002612static void rbd_dev_id_get(struct rbd_device *rbd_dev)
Alex Elderb7f23c32012-01-29 13:57:43 -06002613{
Alex Eldere2839302012-08-29 17:11:06 -05002614 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
Alex Elder499afd52012-02-02 08:13:29 -06002615
2616 spin_lock(&rbd_dev_list_lock);
2617 list_add_tail(&rbd_dev->node, &rbd_dev_list);
2618 spin_unlock(&rbd_dev_list_lock);
Alex Eldere2839302012-08-29 17:11:06 -05002619 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
2620 (unsigned long long) rbd_dev->dev_id);
Alex Elder1ddbe942012-01-29 13:57:44 -06002621}
Alex Elderb7f23c32012-01-29 13:57:43 -06002622
Alex Elder1ddbe942012-01-29 13:57:44 -06002623/*
Alex Elder499afd52012-02-02 08:13:29 -06002624 * Remove an rbd_dev from the global list, and record that its
2625 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002626 */
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
{
	struct list_head *tmp;
	int rbd_id = rbd_dev->dev_id;	/* id being released */
	int max_id;

	rbd_assert(rbd_id > 0);		/* ids start at 1 */

	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);

	/*
	 * If the id being "put" is not the current maximum, there
	 * is nothing special we need to do.
	 */
	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
		spin_unlock(&rbd_dev_list_lock);
		return;
	}

	/*
	 * We need to update the current maximum id.  Search the
	 * list to find out what it is.  We're more likely to find
	 * the maximum at the end, so search the list backward.
	 */
	max_id = 0;
	list_for_each_prev(tmp, &rbd_dev_list) {
		/* NOTE: this inner rbd_dev shadows the parameter */
		struct rbd_device *rbd_dev;

		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id > max_id)
			max_id = rbd_dev->dev_id;
	}
	spin_unlock(&rbd_dev_list_lock);

	/*
	 * The max id could have been updated by rbd_dev_id_get(), in
	 * which case it now accurately reflects the new maximum.
	 * Be careful not to overwrite the maximum value in that
	 * case.
	 */
	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
	dout(" max dev id has been reset\n");
}
2673
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *p = *buf;

	p += strspn(p, whitespace);	/* skip leading whitespace */
	*buf = p;			/* report start of token */

	return strcspn(p, whitespace);	/* length of token at p */
}
2692
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	/* Only copy when the token (plus '\0') actually fits */
	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;	/* advance past the token regardless */

	return len;
}
2722
2723/*
Alex Elderea3352f2012-07-09 21:04:23 -05002724 * Finds the next token in *buf, dynamically allocates a buffer big
2725 * enough to hold a copy of it, and copies the token into the new
2726 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2727 * that a duplicate buffer is created even for a zero-length token.
2728 *
2729 * Returns a pointer to the newly-allocated duplicate, or a null
2730 * pointer if memory for the duplicate was not available. If
2731 * the lenp argument is a non-null pointer, the length of the token
2732 * (not including the '\0') is returned in *lenp.
2733 *
2734 * If successful, the *buf pointer will be updated to point beyond
2735 * the end of the found token.
2736 *
2737 * Note: uses GFP_KERNEL for allocation.
2738 */
2739static inline char *dup_token(const char **buf, size_t *lenp)
2740{
2741 char *dup;
2742 size_t len;
2743
2744 len = next_token(buf);
2745 dup = kmalloc(len + 1, GFP_KERNEL);
2746 if (!dup)
2747 return NULL;
2748
2749 memcpy(dup, *buf, len);
2750 *(dup + len) = '\0';
2751 *buf += len;
2752
2753 if (lenp)
2754 *lenp = len;
2755
2756 return dup;
2757}
2758
2759/*
Alex Elder3feeb8942012-08-31 17:29:52 -05002760 * This fills in the pool_name, image_name, image_name_len, rbd_dev,
2761 * rbd_md_name, and name fields of the given rbd_dev, based on the
2762 * list of monitor addresses and other options provided via
2763 * /sys/bus/rbd/add. Returns a pointer to a dynamically-allocated
2764 * copy of the snapshot name to map if successful, or a
2765 * pointer-coded error otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05002766 *
2767 * Note: rbd_dev is assumed to have been initially zero-filled.
Alex Eldera725f65e2012-02-02 08:13:30 -06002768 */
static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
				const char *buf,
				const char **mon_addrs,
				size_t *mon_addrs_size,
				char *options,
				size_t options_size)
{
	size_t len;
	char *err_ptr = ERR_PTR(-EINVAL);	/* bad-input errors first */
	char *snap_name;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len)
		return err_ptr;		/* no monitor address(es) given */
	*mon_addrs_size = len + 1;	/* includes room for a '\0' */
	*mon_addrs = buf;	/* points into caller's buffer, not a copy */

	buf += len;

	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return err_ptr;	/* options token missing or too long */

	err_ptr = ERR_PTR(-ENOMEM);	/* remaining failures are allocations */
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Snapshot name is optional; default is to use "head" */

	len = next_token(&buf);
	if (len > RBD_MAX_SNAP_NAME_LEN) {
		err_ptr = ERR_PTR(-ENAMETOOLONG);
		goto out_err;
	}
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	}
	/* Return an allocated copy of the snapshot name; caller owns it */
	snap_name = kmalloc(len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_err;
	memcpy(snap_name, buf, len);
	*(snap_name + len) = '\0';

	return snap_name;

out_err:
	/* Undo any dup_token() allocations already recorded in rbd_dev */
	kfree(rbd_dev->image_name);
	rbd_dev->image_name = NULL;
	rbd_dev->image_name_len = 0;
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return err_ptr;
}
2831
Alex Elder589d30e2012-07-10 20:30:11 -05002832/*
2833 * An rbd format 2 image has a unique identifier, distinct from the
2834 * name given to it by the user. Internally, that identifier is
2835 * what's used to specify the names of objects related to the image.
2836 *
2837 * A special "rbd id" object is used to map an rbd image name to its
2838 * id. If that object doesn't exist, then there is no v2 rbd image
2839 * with the supplied name.
2840 *
2841 * This function will record the given rbd_dev's image_id field if
2842 * it can be determined, and in that case will return 0. If any
2843 * errors occur a negative errno will be returned and the rbd_dev's
2844 * image_id field will be unchanged (and should be NULL).
2845 */
2846static int rbd_dev_image_id(struct rbd_device *rbd_dev)
2847{
2848 int ret;
2849 size_t size;
2850 char *object_name;
2851 void *response;
2852 void *p;
2853
2854 /*
2855 * First, see if the format 2 image id file exists, and if
2856 * so, get the image's persistent id from it.
2857 */
2858 size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len;
2859 object_name = kmalloc(size, GFP_NOIO);
2860 if (!object_name)
2861 return -ENOMEM;
2862 sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name);
2863 dout("rbd id object name is %s\n", object_name);
2864
2865 /* Response will be an encoded string, which includes a length */
2866
2867 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
2868 response = kzalloc(size, GFP_NOIO);
2869 if (!response) {
2870 ret = -ENOMEM;
2871 goto out;
2872 }
2873
2874 ret = rbd_req_sync_exec(rbd_dev, object_name,
2875 "rbd", "get_id",
2876 NULL, 0,
2877 response, RBD_IMAGE_ID_LEN_MAX,
2878 CEPH_OSD_FLAG_READ, NULL);
2879 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2880 if (ret < 0)
2881 goto out;
Alex Eldera0ea3a42012-10-10 21:19:13 -07002882 ret = 0; /* rbd_req_sync_exec() can return positive */
Alex Elder589d30e2012-07-10 20:30:11 -05002883
2884 p = response;
2885 rbd_dev->image_id = ceph_extract_encoded_string(&p,
2886 p + RBD_IMAGE_ID_LEN_MAX,
2887 &rbd_dev->image_id_len,
2888 GFP_NOIO);
2889 if (IS_ERR(rbd_dev->image_id)) {
2890 ret = PTR_ERR(rbd_dev->image_id);
2891 rbd_dev->image_id = NULL;
2892 } else {
2893 dout("image_id is %s\n", rbd_dev->image_id);
2894 }
2895out:
2896 kfree(response);
2897 kfree(object_name);
2898
2899 return ret;
2900}
2901
Alex Eldera30b71b2012-07-10 20:30:11 -05002902static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
2903{
2904 int ret;
2905 size_t size;
2906
2907 /* Version 1 images have no id; empty string is used */
2908
2909 rbd_dev->image_id = kstrdup("", GFP_KERNEL);
2910 if (!rbd_dev->image_id)
2911 return -ENOMEM;
2912 rbd_dev->image_id_len = 0;
2913
2914 /* Record the header object name for this rbd image. */
2915
2916 size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX);
2917 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
2918 if (!rbd_dev->header_name) {
2919 ret = -ENOMEM;
2920 goto out_err;
2921 }
2922 sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
2923
2924 /* Populate rbd image metadata */
2925
2926 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
2927 if (ret < 0)
2928 goto out_err;
2929 rbd_dev->image_format = 1;
2930
2931 dout("discovered version 1 image, header name is %s\n",
2932 rbd_dev->header_name);
2933
2934 return 0;
2935
2936out_err:
2937 kfree(rbd_dev->header_name);
2938 rbd_dev->header_name = NULL;
2939 kfree(rbd_dev->image_id);
2940 rbd_dev->image_id = NULL;
2941
2942 return ret;
2943}
2944
/*
 * Complete probing of a format 2 image: record its header object
 * name (derived from the image id) and fill in size, object prefix,
 * features, and snapshot context.  Returns 0 or a negative errno;
 * on failure the header state recorded here is freed and cleared.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len;
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3006
3007/*
3008 * Probe for the existence of the header object for the given rbd
3009 * device. For format 2 images this includes determining the image
3010 * id.
3011 */
/*
 * Probe for the existence of the header object for the given rbd
 * device.  For format 2 images this includes determining the image
 * id; a failure to find the id object means the image is assumed to
 * be format 1.
 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret = rbd_dev_image_id(rbd_dev);

	if (ret)
		ret = rbd_dev_v1_probe(rbd_dev);	/* no id object */
	else
		ret = rbd_dev_v2_probe(rbd_dev);
	if (ret)
		dout("probe failed, returning %d\n", ret);

	return ret;
}
3031
/*
 * sysfs "add" handler: parse the user-supplied spec (monitor
 * addresses, options, pool, image, optional snapshot), connect a
 * client, probe the image, and set up the block device and its
 * sysfs/snapshot machinery.  Returns count on success or a negative
 * errno; on failure everything acquired so far is unwound via the
 * err_out_* ladder (in reverse order of acquisition).
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	char *options;
	struct rbd_device *rbd_dev = NULL;
	const char *mon_addrs = NULL;
	size_t mon_addrs_size = 0;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;
	char *snap_name;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	options = kmalloc(count, GFP_KERNEL);
	if (!options)
		goto err_out_mem;
	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		goto err_out_mem;

	/* static rbd_device initialization */
	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);
	init_rwsem(&rbd_dev->header_rwsem);

	/* parse add command */
	snap_name = rbd_add_parse_args(rbd_dev, buf,
				&mon_addrs, &mon_addrs_size, options, count);
	if (IS_ERR(snap_name)) {
		rc = PTR_ERR(snap_name);
		goto err_out_mem;
	}

	/*
	 * NOTE(review): snap_name is only stored (and later freed as
	 * mapping.snap_name) by rbd_dev_set_mapping(); errors taken
	 * before that call appear to leak it -- confirm.
	 */
	rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
	if (rc < 0)
		goto err_out_args;

	/* pick the pool */
	osdc = &rbd_dev->rbd_client->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
	if (rc < 0)
		goto err_out_client;
	rbd_dev->pool_id = rc;	/* nonnegative result is the pool id */

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_client;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = rbd_dev_snaps_update(rbd_dev);
	if (rc)
		goto err_out_header;

	rc = rbd_dev_set_mapping(rbd_dev, snap_name);
	if (rc)
		goto err_out_header;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	rc = register_blkdev(0, rbd_dev->name);
	if (rc < 0)
		goto err_out_id;
	rbd_dev->major = rc;	/* dynamic major number */

	/* Set up the blkdev mapping. */

	rc = rbd_init_disk(rbd_dev);
	if (rc)
		goto err_out_blkdev;

	rc = rbd_bus_add_dev(rbd_dev);
	if (rc)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */

	down_write(&rbd_dev->header_rwsem);
	rc = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (rc)
		goto err_out_bus;

	rc = rbd_init_watch_dev(rbd_dev);
	if (rc)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return count;

err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);
	kfree(options);
	return rc;

err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_header:
	rbd_header_free(&rbd_dev->header);
err_out_client:
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev);
	kfree(rbd_dev->image_id);
err_out_args:
	kfree(rbd_dev->mapping.snap_name);
	kfree(rbd_dev->image_name);
	kfree(rbd_dev->pool_name);
err_out_mem:
	kfree(rbd_dev);
	kfree(options);

	dout("Error adding device %s\n", buf);
	module_put(THIS_MODULE);

	return (ssize_t) rc;
}
3173
Alex Elderde71a292012-07-03 16:01:19 -05003174static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003175{
3176 struct list_head *tmp;
3177 struct rbd_device *rbd_dev;
3178
Alex Eldere124a82f2012-01-29 13:57:44 -06003179 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003180 list_for_each(tmp, &rbd_dev_list) {
3181 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05003182 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a82f2012-01-29 13:57:44 -06003183 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003184 return rbd_dev;
Alex Eldere124a82f2012-01-29 13:57:44 -06003185 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003186 }
Alex Eldere124a82f2012-01-29 13:57:44 -06003187 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003188 return NULL;
3189}
3190
/*
 * Final teardown of an rbd device, called by the driver core when the
 * device's last sysfs reference is dropped (initiated by
 * rbd_bus_del_dev()).
 *
 * Teardown order is significant: the lingering watch request must be
 * unregistered before the ceph client is released, and the disk and
 * block device must be torn down before the header fields and name
 * strings they reference are freed.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Cancel the lingering watch request while the client is alive */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	/* Tell the OSDs we no longer want header-change notifications */
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->mapping.snap_name);
	kfree(rbd_dev->image_id);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_dev_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
3225
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003226static ssize_t rbd_remove(struct bus_type *bus,
3227 const char *buf,
3228 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003229{
3230 struct rbd_device *rbd_dev = NULL;
3231 int target_id, rc;
3232 unsigned long ul;
3233 int ret = count;
3234
3235 rc = strict_strtoul(buf, 10, &ul);
3236 if (rc)
3237 return rc;
3238
3239 /* convert to int; abort if we lost anything in the conversion */
3240 target_id = (int) ul;
3241 if (target_id != ul)
3242 return -EINVAL;
3243
3244 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3245
3246 rbd_dev = __rbd_get_dev(target_id);
3247 if (!rbd_dev) {
3248 ret = -ENOENT;
3249 goto done;
3250 }
3251
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003252 __rbd_remove_all_snaps(rbd_dev);
3253 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003254
3255done:
3256 mutex_unlock(&ctl_mutex);
Alex Elderaafb2302012-09-06 16:00:54 -05003257
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003258 return ret;
3259}
3260
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003261/*
3262 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003263 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003264 */
3265static int rbd_sysfs_init(void)
3266{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003267 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003268
Alex Elderfed4c142012-02-07 12:03:36 -06003269 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06003270 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003271 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003272
Alex Elderfed4c142012-02-07 12:03:36 -06003273 ret = bus_register(&rbd_bus_type);
3274 if (ret < 0)
3275 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003276
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003277 return ret;
3278}
3279
/*
 * Undo rbd_sysfs_init(), tearing things down in the reverse order
 * they were set up: the bus first, then the root device.
 */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3285
3286int __init rbd_init(void)
3287{
3288 int rc;
3289
3290 rc = rbd_sysfs_init();
3291 if (rc)
3292 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06003293 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003294 return 0;
3295}
3296
/* Module exit point: remove the driver's sysfs control files. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3301
/* Module entry/exit hooks and metadata */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");