blob: 76fbfa120064dc6b1303324dc86c4665371ddb09 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elderaafb2302012-09-06 16:00:54 -050044#define RBD_DEBUG /* Activate rbd_assert() calls */
45
Alex Elder593a9e72012-02-07 12:03:37 -060046/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
Alex Elderdf111be2012-08-09 10:33:26 -070055/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
Alex Elderf0f8cef2012-01-29 13:57:44 -060059#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070061
62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
63
Alex Elderd4b125e2012-07-03 16:01:19 -050064#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
65#define RBD_MAX_SNAP_NAME_LEN \
66 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
67
Alex Elder35d489f2012-07-03 16:01:19 -050068#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070069#define RBD_MAX_OPT_LEN 1024
70
71#define RBD_SNAP_HEAD_NAME "-"
72
Alex Elder1e130192012-07-03 16:01:19 -050073#define RBD_IMAGE_ID_LEN_MAX 64
74#define RBD_OBJ_PREFIX_LEN_MAX 64
Alex Elder589d30e2012-07-10 20:30:11 -050075
Alex Elderd8891402012-10-09 13:50:17 -070076/* Feature bits */
77
78#define RBD_FEATURE_LAYERING 1
79
80/* Features supported by this (client software) implementation. */
81
82#define RBD_FEATURES_ALL (0)
83
Alex Elder81a89792012-02-02 08:13:30 -060084/*
85 * An RBD device name will be "rbd#", where the "rbd" comes from
86 * RBD_DRV_NAME above, and # is a unique integer identifier.
87 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
88 * enough to hold all possible device names.
89 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070090#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060091#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070092
Alex Eldercc0538b2012-08-10 13:12:07 -070093#define RBD_READ_ONLY_DEFAULT false
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070094
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These four fields never change for a given rbd image */
	char *object_prefix;	/* name prefix shared by all data objects */
	u64 features;		/* RBD_FEATURE_* bits (always 0 for v1 images) */
	__u8 obj_order;		/* log2 of the object (segment) size */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* image size in bytes */
	struct ceph_snap_context *snapc;	/* snapshot ids, newest first */
	char *snap_names;	/* packed NUL-terminated snapshot names */
	u64 *snap_sizes;	/* image size at the time of each snapshot */

	u64 obj_version;	/* version of the header this was read from */
};
114
115struct rbd_options {
Alex Eldercc0538b2012-08-10 13:12:07 -0700116 bool read_only;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700117};
118
119/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600120 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700121 */
122struct rbd_client {
123 struct ceph_client *client;
124 struct kref kref;
125 struct list_head node;
126};
127
128/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600129 * a request completion status
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700130 */
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700131struct rbd_req_status {
132 int done;
133 int rc;
134 u64 bytes;
135};
136
137/*
138 * a collection of requests
139 */
140struct rbd_req_coll {
141 int total;
142 int num_done;
143 struct kref kref;
144 struct rbd_req_status status[0];
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700145};
146
Alex Elderf0f8cef2012-01-29 13:57:44 -0600147/*
148 * a single io request
149 */
150struct rbd_request {
151 struct request *rq; /* blk layer request */
152 struct bio *bio; /* cloned bio */
153 struct page **pages; /* list of used pages */
154 u64 len;
155 int coll_index;
156 struct rbd_req_coll *coll;
157};
158
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800159struct rbd_snap {
160 struct device dev;
161 const char *name;
Josh Durgin35915382011-12-05 18:25:13 -0800162 u64 size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800163 struct list_head node;
164 u64 id;
Alex Elder34b13182012-07-13 20:35:12 -0500165 u64 features;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800166};
167
Alex Elderf84344f2012-08-31 17:29:51 -0500168struct rbd_mapping {
169 char *snap_name;
170 u64 snap_id;
Alex Elder99c1f082012-08-30 14:42:15 -0500171 u64 size;
Alex Elder34b13182012-07-13 20:35:12 -0500172 u64 features;
Alex Elderf84344f2012-08-31 17:29:51 -0500173 bool snap_exists;
174 bool read_only;
175};
176
/*
 * a single device
 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;	/* blkdev's gendisk and rq */

	u32 image_format;	/* Either 1 or 2 */
	struct rbd_options rbd_opts;	/* options given at map time */
	struct rbd_client *rbd_client;	/* shared ceph client (refcounted) */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;	/* queue lock */

	struct rbd_image_header header;	/* in-memory copy of image header */
	char *image_id;		/* format 2 image id (NUL-terminated) */
	size_t image_id_len;
	char *image_name;	/* user-visible image name */
	size_t image_name_len;
	char *header_name;	/* name of the image's header object */
	char *pool_name;
	int pool_id;

	/* watch on the header object, for update notifications */
	struct ceph_osd_event *watch_event;
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore header_rwsem;

	struct rbd_mapping mapping;	/* what we're currently mapped to */

	struct list_head node;	/* entry in global rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
};
219
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700220static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a82f2012-01-29 13:57:44 -0600221
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700222static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a82f2012-01-29 13:57:44 -0600223static DEFINE_SPINLOCK(rbd_dev_list_lock);
224
Alex Elder432b8582012-01-29 13:57:44 -0600225static LIST_HEAD(rbd_client_list); /* clients */
226static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700227
Alex Elder304f6802012-08-31 17:29:52 -0500228static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
229static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
230
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800231static void rbd_dev_release(struct device *dev);
Alex Elder14e70852012-07-19 09:09:27 -0500232static void __rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800233
Alex Elderf0f8cef2012-01-29 13:57:44 -0600234static ssize_t rbd_add(struct bus_type *bus, const char *buf,
235 size_t count);
236static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
237 size_t count);
238
239static struct bus_attribute rbd_bus_attrs[] = {
240 __ATTR(add, S_IWUSR, NULL, rbd_add),
241 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
242 __ATTR_NULL
243};
244
245static struct bus_type rbd_bus_type = {
246 .name = "rbd",
247 .bus_attrs = rbd_bus_attrs,
248};
249
250static void rbd_root_dev_release(struct device *dev)
251{
252}
253
254static struct device rbd_root_dev = {
255 .init_name = "rbd",
256 .release = rbd_root_dev_release,
257};
258
Alex Elderaafb2302012-09-06 16:00:54 -0500259#ifdef RBD_DEBUG
260#define rbd_assert(expr) \
261 if (unlikely(!(expr))) { \
262 printk(KERN_ERR "\nAssertion failure in %s() " \
263 "at line %d:\n\n" \
264 "\trbd_assert(%s);\n\n", \
265 __func__, __LINE__, #expr); \
266 BUG(); \
267 }
268#else /* !RBD_DEBUG */
269# define rbd_assert(expr) ((void) 0)
270#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800271
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800272static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
273{
274 return get_device(&rbd_dev->dev);
275}
276
277static void rbd_put_dev(struct rbd_device *rbd_dev)
278{
279 put_device(&rbd_dev->dev);
280}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700281
Alex Elder117973f2012-08-31 17:29:55 -0500282static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
283static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700284
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700285static int rbd_open(struct block_device *bdev, fmode_t mode)
286{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600287 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700288
Alex Elderf84344f2012-08-31 17:29:51 -0500289 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700290 return -EROFS;
291
Alex Elder340c7a22012-08-10 13:12:07 -0700292 rbd_get_dev(rbd_dev);
Alex Elderf84344f2012-08-31 17:29:51 -0500293 set_device_ro(bdev, rbd_dev->mapping.read_only);
Alex Elder340c7a22012-08-10 13:12:07 -0700294
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700295 return 0;
296}
297
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800298static int rbd_release(struct gendisk *disk, fmode_t mode)
299{
300 struct rbd_device *rbd_dev = disk->private_data;
301
302 rbd_put_dev(rbd_dev);
303
304 return 0;
305}
306
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700307static const struct block_device_operations rbd_bd_ops = {
308 .owner = THIS_MODULE,
309 .open = rbd_open,
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800310 .release = rbd_release,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700311};
312
313/*
314 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500315 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700316 */
Alex Elderf8c38922012-08-10 13:12:07 -0700317static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700318{
319 struct rbd_client *rbdc;
320 int ret = -ENOMEM;
321
322 dout("rbd_client_create\n");
323 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
324 if (!rbdc)
325 goto out_opt;
326
327 kref_init(&rbdc->kref);
328 INIT_LIST_HEAD(&rbdc->node);
329
Alex Elderbc534d82012-01-29 13:57:44 -0600330 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
331
Alex Elder43ae4702012-07-03 16:01:18 -0500332 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700333 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600334 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500335 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700336
337 ret = ceph_open_session(rbdc->client);
338 if (ret < 0)
339 goto out_err;
340
Alex Elder432b8582012-01-29 13:57:44 -0600341 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700342 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600343 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700344
Alex Elderbc534d82012-01-29 13:57:44 -0600345 mutex_unlock(&ctl_mutex);
346
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700347 dout("rbd_client_create created %p\n", rbdc);
348 return rbdc;
349
350out_err:
351 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600352out_mutex:
353 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700354 kfree(rbdc);
355out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500356 if (ceph_opts)
357 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400358 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700359}
360
361/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700362 * Find a ceph client with specific addr and configuration. If
363 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700364 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700365static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700366{
367 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700368 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700369
Alex Elder43ae4702012-07-03 16:01:18 -0500370 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700371 return NULL;
372
Alex Elder1f7ba332012-08-10 13:12:07 -0700373 spin_lock(&rbd_client_list_lock);
374 list_for_each_entry(client_node, &rbd_client_list, node) {
375 if (!ceph_compare_options(ceph_opts, client_node->client)) {
376 kref_get(&client_node->kref);
377 found = true;
378 break;
379 }
380 }
381 spin_unlock(&rbd_client_list_lock);
382
383 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700384}
385
386/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700387 * mount options
388 */
389enum {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700390 Opt_last_int,
391 /* int args above */
392 Opt_last_string,
393 /* string args above */
Alex Eldercc0538b2012-08-10 13:12:07 -0700394 Opt_read_only,
395 Opt_read_write,
396 /* Boolean args above */
397 Opt_last_bool,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700398};
399
Alex Elder43ae4702012-07-03 16:01:18 -0500400static match_table_t rbd_opts_tokens = {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700401 /* int args above */
402 /* string args above */
Alex Elderbe466c12012-10-22 11:31:26 -0500403 {Opt_read_only, "read_only"},
Alex Eldercc0538b2012-08-10 13:12:07 -0700404 {Opt_read_only, "ro"}, /* Alternate spelling */
405 {Opt_read_write, "read_write"},
406 {Opt_read_write, "rw"}, /* Alternate spelling */
407 /* Boolean args above */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700408 {-1, NULL}
409};
410
/*
 * Callback for ceph_parse_options(): handle one rbd-specific option
 * token.  @private points at the struct rbd_options being filled in.
 * Integer/string valued tokens decode their argument here; Boolean
 * tokens carry no argument.  Returns 0 or a negative errno.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* match_token() returned a token we don't handle above */
		rbd_assert(false);
		break;
	}
	return 0;
}
451
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.
 *
 * Parses @options (also filling in rbd_dev->rbd_opts) and the monitor
 * address list, then reuses a matching client from the global list or
 * creates a new one.  Sets rbd_dev->rbd_client on success and returns
 * 0, or a negative errno on failure.
 */
static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
			  size_t mon_addr_len, char *options)
{
	struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
	struct ceph_options *ceph_opts;
	struct rbd_client *rbdc;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	ceph_opts = ceph_parse_options(options, mon_addr,
					mon_addr + mon_addr_len,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(ceph_opts))
		return PTR_ERR(ceph_opts);

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		/* using an existing client; rbd_client_find() took a ref */
		ceph_destroy_options(ceph_opts);
	} else {
		/* rbd_client_create() consumes ceph_opts on every path */
		rbdc = rbd_client_create(ceph_opts);
		if (IS_ERR(rbdc))
			return PTR_ERR(rbdc);
	}
	rbd_dev->rbd_client = rbdc;

	return 0;
}
484
/*
 * Destroy ceph client
 *
 * NOTE: rbd_client_list_lock is acquired inside this function, so the
 * caller must NOT already hold it.  (An older comment here said the
 * opposite; it predated the lock being taken internally.)
 */
/* Final kref release: unlink from the global list and destroy. */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	/* takes rbd_client_list_lock itself; caller must not hold it */
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
502
503/*
504 * Drop reference to ceph client node. If it's not referenced anymore, release
505 * it.
506 */
507static void rbd_put_client(struct rbd_device *rbd_dev)
508{
509 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
510 rbd_dev->rbd_client = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700511}
512
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700513/*
514 * Destroy requests collection
515 */
516static void rbd_coll_release(struct kref *kref)
517{
518 struct rbd_req_coll *coll =
519 container_of(kref, struct rbd_req_coll, kref);
520
521 dout("rbd_coll_release %p\n", coll);
522 kfree(coll);
523}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700524
Alex Eldera30b71b2012-07-10 20:30:11 -0500525static bool rbd_image_format_valid(u32 image_format)
526{
527 return image_format == 1 || image_format == 2;
528}
529
/*
 * Sanity-check an on-disk (format 1) image header before trusting any
 * of its fields.  Returns false if the header cannot be valid.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
568
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700569/*
570 * Create a new header structure, translate header format from the on-disk
571 * header.
572 */
573static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500574 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700575{
Alex Elderccece232012-07-10 20:30:10 -0500576 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500577 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500578 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500579 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700580
Alex Elder6a523252012-07-19 17:12:59 -0500581 memset(header, 0, sizeof (*header));
582
Alex Elder103a1502012-08-02 11:29:45 -0500583 snap_count = le32_to_cpu(ondisk->snap_count);
584
Alex Elder58c17b02012-08-23 23:22:06 -0500585 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
586 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500587 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700588 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500589 memcpy(header->object_prefix, ondisk->object_prefix, len);
590 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600591
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700592 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500593 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
594
Alex Elder621901d2012-08-23 23:22:06 -0500595 /* Save a copy of the snapshot names */
596
Alex Elderf785cc12012-08-23 23:22:06 -0500597 if (snap_names_len > (u64) SIZE_MAX)
598 return -EIO;
599 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700600 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500601 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500602 /*
603 * Note that rbd_dev_v1_header_read() guarantees
604 * the ondisk buffer we're working with has
605 * snap_names_len bytes beyond the end of the
606 * snapshot id array, this memcpy() is safe.
607 */
608 memcpy(header->snap_names, &ondisk->snaps[snap_count],
609 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500610
Alex Elder621901d2012-08-23 23:22:06 -0500611 /* Record each snapshot's size */
612
Alex Elderd2bb24e2012-07-26 23:37:14 -0500613 size = snap_count * sizeof (*header->snap_sizes);
614 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700615 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500616 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500617 for (i = 0; i < snap_count; i++)
618 header->snap_sizes[i] =
619 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700620 } else {
Alex Elderccece232012-07-10 20:30:10 -0500621 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700622 header->snap_names = NULL;
623 header->snap_sizes = NULL;
624 }
Alex Elder849b4262012-07-09 21:04:24 -0500625
Alex Elder34b13182012-07-13 20:35:12 -0500626 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700627 header->obj_order = ondisk->options.order;
628 header->crypt_type = ondisk->options.crypt_type;
629 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500630
Alex Elder621901d2012-08-23 23:22:06 -0500631 /* Allocate and fill in the snapshot context */
632
Alex Elderf84344f2012-08-31 17:29:51 -0500633 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500634 size = sizeof (struct ceph_snap_context);
635 size += snap_count * sizeof (header->snapc->snaps[0]);
636 header->snapc = kzalloc(size, GFP_KERNEL);
637 if (!header->snapc)
638 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700639
640 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500641 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700642 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500643 for (i = 0; i < snap_count; i++)
644 header->snapc->snaps[i] =
645 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700646
647 return 0;
648
Alex Elder6a523252012-07-19 17:12:59 -0500649out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500650 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500651 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700652 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500653 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500654 kfree(header->object_prefix);
655 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500656
Alex Elder00f1f362012-02-07 12:03:36 -0600657 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700658}
659
Alex Elder8836b992012-08-30 14:42:15 -0500660static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700661{
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700662
Alex Eldere86924a2012-07-10 20:30:11 -0500663 struct rbd_snap *snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600664
Alex Eldere86924a2012-07-10 20:30:11 -0500665 list_for_each_entry(snap, &rbd_dev->snaps, node) {
666 if (!strcmp(snap_name, snap->name)) {
667 rbd_dev->mapping.snap_id = snap->id;
668 rbd_dev->mapping.size = snap->size;
Alex Elder34b13182012-07-13 20:35:12 -0500669 rbd_dev->mapping.features = snap->features;
Alex Elder00f1f362012-02-07 12:03:36 -0600670
Alex Eldere86924a2012-07-10 20:30:11 -0500671 return 0;
Alex Elder00f1f362012-02-07 12:03:36 -0600672 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700673 }
Alex Eldere86924a2012-07-10 20:30:11 -0500674
Alex Elder00f1f362012-02-07 12:03:36 -0600675 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700676}
677
/*
 * Set up the device mapping for the given snapshot name.  The special
 * name RBD_SNAP_HEAD_NAME ("-") maps the image head, writable unless
 * the read-only option was given; any other name maps that snapshot
 * read-only.  Returns 0, or -ENOENT if the snapshot doesn't exist.
 */
static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
{
	int ret;

	if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		rbd_dev->mapping.snap_id = CEPH_NOSNAP;
		rbd_dev->mapping.size = rbd_dev->header.image_size;
		rbd_dev->mapping.features = rbd_dev->header.features;
		rbd_dev->mapping.snap_exists = false;
		rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only;
		ret = 0;
	} else {
		ret = snap_by_name(rbd_dev, snap_name);
		if (ret < 0)
			goto done;
		rbd_dev->mapping.snap_exists = true;
		rbd_dev->mapping.read_only = true;
	}
	/* keeps the pointer, no copy — caller's buffer must stay valid */
	rbd_dev->mapping.snap_name = snap_name;
done:
	return ret;
}
701
702static void rbd_header_free(struct rbd_image_header *header)
703{
Alex Elder849b4262012-07-09 21:04:24 -0500704 kfree(header->object_prefix);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500705 header->object_prefix = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700706 kfree(header->snap_sizes);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500707 header->snap_sizes = NULL;
Alex Elder849b4262012-07-09 21:04:24 -0500708 kfree(header->snap_names);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500709 header->snap_names = NULL;
Josh Durgind1d25642011-12-05 14:03:05 -0800710 ceph_put_snap_context(header->snapc);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500711 header->snapc = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700712}
713
Alex Elder65ccfe22012-08-09 10:33:26 -0700714static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700715{
Alex Elder65ccfe22012-08-09 10:33:26 -0700716 char *name;
717 u64 segment;
718 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700719
Alex Elder65ccfe22012-08-09 10:33:26 -0700720 name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
721 if (!name)
722 return NULL;
723 segment = offset >> rbd_dev->header.obj_order;
724 ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
725 rbd_dev->header.object_prefix, segment);
726 if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
727 pr_err("error formatting segment name for #%llu (%d)\n",
728 segment, ret);
729 kfree(name);
730 name = NULL;
731 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700732
Alex Elder65ccfe22012-08-09 10:33:26 -0700733 return name;
734}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700735
Alex Elder65ccfe22012-08-09 10:33:26 -0700736static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
737{
738 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700739
Alex Elder65ccfe22012-08-09 10:33:26 -0700740 return offset & (segment_size - 1);
741}
742
743static u64 rbd_segment_length(struct rbd_device *rbd_dev,
744 u64 offset, u64 length)
745{
746 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
747
748 offset &= segment_size - 1;
749
Alex Elderaafb2302012-09-06 16:00:54 -0500750 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700751 if (offset + length > segment_size)
752 length = segment_size - offset;
753
754 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700755}
756
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700757static int rbd_get_num_segments(struct rbd_image_header *header,
758 u64 ofs, u64 len)
759{
Alex Elderdf111be2012-08-09 10:33:26 -0700760 u64 start_seg;
761 u64 end_seg;
762
763 if (!len)
764 return 0;
765 if (len - 1 > U64_MAX - ofs)
766 return -ERANGE;
767
768 start_seg = ofs >> header->obj_order;
769 end_seg = (ofs + len - 1) >> header->obj_order;
770
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700771 return end_seg - start_seg + 1;
772}
773
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700774/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700775 * returns the size of an object in the image
776 */
777static u64 rbd_obj_bytes(struct rbd_image_header *header)
778{
779 return 1 << header->obj_order;
780}
781
782/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700783 * bio helpers
784 */
785
786static void bio_chain_put(struct bio *chain)
787{
788 struct bio *tmp;
789
790 while (chain) {
791 tmp = chain;
792 chain = chain->bi_next;
793 bio_put(tmp);
794 }
795}
796
797/*
798 * zeros a bio chain, starting at specific offset
799 */
800static void zero_bio_chain(struct bio *chain, int start_ofs)
801{
802 struct bio_vec *bv;
803 unsigned long flags;
804 void *buf;
805 int i;
806 int pos = 0;
807
808 while (chain) {
809 bio_for_each_segment(bv, chain, i) {
810 if (pos + bv->bv_len > start_ofs) {
811 int remainder = max(start_ofs - pos, 0);
812 buf = bvec_kmap_irq(bv, &flags);
813 memset(buf + remainder, 0,
814 bv->bv_len - remainder);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +0200815 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700816 }
817 pos += bv->bv_len;
818 }
819
820 chain = chain->bi_next;
821 }
822}
823
/*
 * bio_chain_clone - clone a chain of bios up to a certain length.
 * might return a bio_pair that will need to be released.
 *
 * Clones bios from *old until exactly @len bytes are covered.  On
 * return *old points at the first unconsumed source bio and *next at
 * the bio to continue from (which may be the second half of a split).
 * Returns the new chain, or NULL on allocation/split failure (any
 * partially-built chain is released).
 */
static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
				   struct bio_pair **bp,
				   int len, gfp_t gfpmask)
{
	struct bio *old_chain = *old;
	struct bio *new_chain = NULL;
	struct bio *tail;	/* last bio appended; valid once new_chain is set */
	int total = 0;

	/* release any split left over from a previous call */
	if (*bp) {
		bio_pair_release(*bp);
		*bp = NULL;
	}

	while (old_chain && (total < len)) {
		struct bio *tmp;

		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
		if (!tmp)
			goto err_out;
		gfpmask &= ~__GFP_WAIT;	/* can't wait after the first */

		if (total + old_chain->bi_size > len) {
			struct bio_pair *bp;

			/*
			 * this split can only happen with a single paged bio,
			 * split_bio will BUG_ON if this is not the case
			 */
			dout("bio_chain_clone split! total=%d remaining=%d"
			     "bi_size=%u\n",
			     total, len - total, old_chain->bi_size);

			/* split the bio. We'll release it either in the next
			   call, or it will have to be released outside */
			bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
			if (!bp)
				goto err_out;

			__bio_clone(tmp, &bp->bio1);

			*next = &bp->bio2;
		} else {
			__bio_clone(tmp, old_chain);
			*next = old_chain->bi_next;
		}

		tmp->bi_bdev = NULL;
		tmp->bi_next = NULL;
		if (new_chain)
			tail->bi_next = tmp;
		else
			new_chain = tmp;
		tail = tmp;
		old_chain = old_chain->bi_next;

		total += tmp->bi_size;
	}

	/* callers size requests to segment boundaries, so this must hold */
	rbd_assert(total == len);

	*old = old_chain;

	return new_chain;

err_out:
	dout("bio_chain_clone with err\n");
	bio_chain_put(new_chain);
	return NULL;
}
898
899/*
900 * helpers for osd request op vectors.
901 */
Alex Elder57cfc102012-06-26 12:57:03 -0700902static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
903 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700904{
Alex Elder57cfc102012-06-26 12:57:03 -0700905 struct ceph_osd_req_op *ops;
906
907 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
908 if (!ops)
909 return NULL;
910
911 ops[0].op = opcode;
912
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700913 /*
914 * op extent offset and length will be set later on
915 * in calc_raw_layout()
916 */
Alex Elder57cfc102012-06-26 12:57:03 -0700917 ops[0].payload_len = payload_len;
918
919 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700920}
921
/* Free an op vector allocated by rbd_create_rw_ops() (NULL is a no-op) */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
926
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700927static void rbd_coll_end_req_index(struct request *rq,
928 struct rbd_req_coll *coll,
929 int index,
930 int ret, u64 len)
931{
932 struct request_queue *q;
933 int min, max, i;
934
Alex Elderbd919d42012-07-13 20:35:11 -0500935 dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
936 coll, index, ret, (unsigned long long) len);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700937
938 if (!rq)
939 return;
940
941 if (!coll) {
942 blk_end_request(rq, ret, len);
943 return;
944 }
945
946 q = rq->q;
947
948 spin_lock_irq(q->queue_lock);
949 coll->status[index].done = 1;
950 coll->status[index].rc = ret;
951 coll->status[index].bytes = len;
952 max = min = coll->num_done;
953 while (max < coll->total && coll->status[max].done)
954 max++;
955
956 for (i = min; i<max; i++) {
957 __blk_end_request(rq, coll->status[i].rc,
958 coll->status[i].bytes);
959 coll->num_done++;
960 kref_put(&coll->kref, rbd_coll_release);
961 }
962 spin_unlock_irq(q->queue_lock);
963}
964
/* Complete the collection slot (if any) associated with @req */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
970
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700971/*
972 * Send ceph osd request
973 */
974static int rbd_do_request(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -0500975 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700976 struct ceph_snap_context *snapc,
977 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -0500978 const char *object_name, u64 ofs, u64 len,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700979 struct bio *bio,
980 struct page **pages,
981 int num_pages,
982 int flags,
983 struct ceph_osd_req_op *ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700984 struct rbd_req_coll *coll,
985 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700986 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700987 struct ceph_msg *msg),
988 struct ceph_osd_request **linger_req,
989 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700990{
991 struct ceph_osd_request *req;
992 struct ceph_file_layout *layout;
993 int ret;
994 u64 bno;
995 struct timespec mtime = CURRENT_TIME;
996 struct rbd_request *req_data;
997 struct ceph_osd_request_head *reqhead;
Alex Elder1dbb4392012-01-24 10:08:37 -0600998 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700999
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001000 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001001 if (!req_data) {
1002 if (coll)
1003 rbd_coll_end_req_index(rq, coll, coll_index,
1004 -ENOMEM, len);
1005 return -ENOMEM;
1006 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001007
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001008 if (coll) {
1009 req_data->coll = coll;
1010 req_data->coll_index = coll_index;
1011 }
1012
Alex Elderbd919d42012-07-13 20:35:11 -05001013 dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
1014 (unsigned long long) ofs, (unsigned long long) len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001015
Alex Elder0ce1a792012-07-03 16:01:18 -05001016 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder1dbb4392012-01-24 10:08:37 -06001017 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
1018 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -07001019 if (!req) {
Sage Weil4ad12622011-05-03 09:23:36 -07001020 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001021 goto done_pages;
1022 }
1023
1024 req->r_callback = rbd_cb;
1025
1026 req_data->rq = rq;
1027 req_data->bio = bio;
1028 req_data->pages = pages;
1029 req_data->len = len;
1030
1031 req->r_priv = req_data;
1032
1033 reqhead = req->r_request->front.iov_base;
1034 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
1035
Alex Elderaded07e2012-07-03 16:01:18 -05001036 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001037 req->r_oid_len = strlen(req->r_oid);
1038
1039 layout = &req->r_file_layout;
1040 memset(layout, 0, sizeof(*layout));
1041 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1042 layout->fl_stripe_count = cpu_to_le32(1);
1043 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
Alex Elder0ce1a792012-07-03 16:01:18 -05001044 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
Sage Weil6cae3712012-09-24 21:02:47 -07001045 ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
1046 req, ops);
1047 rbd_assert(ret == 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001048
1049 ceph_osdc_build_request(req, ofs, &len,
1050 ops,
1051 snapc,
1052 &mtime,
1053 req->r_oid, req->r_oid_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001054
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001055 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -06001056 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001057 *linger_req = req;
1058 }
1059
Alex Elder1dbb4392012-01-24 10:08:37 -06001060 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001061 if (ret < 0)
1062 goto done_err;
1063
1064 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -06001065 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001066 if (ver)
1067 *ver = le64_to_cpu(req->r_reassert_version.version);
Alex Elderbd919d42012-07-13 20:35:11 -05001068 dout("reassert_ver=%llu\n",
1069 (unsigned long long)
1070 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001071 ceph_osdc_put_request(req);
1072 }
1073 return ret;
1074
1075done_err:
1076 bio_chain_put(req_data->bio);
1077 ceph_osdc_put_request(req);
1078done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001079 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001080 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001081 return ret;
1082}
1083
/*
 * Ceph osd op callback
 *
 * Completion handler for asynchronous OSD requests: decodes the
 * reply, zero-fills short or missing-object reads, reports status to
 * the request collection, and releases the bio chain, the osd
 * request, and the per-request bookkeeping.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
	     (unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* reading a hole (nonexistent object) returns zeroes */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero the remainder of the buffer */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1123
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001124static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1125{
1126 ceph_osdc_put_request(req);
1127}
1128
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001129/*
1130 * Do a synchronous ceph osd operation
1131 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001132static int rbd_req_sync_op(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001133 struct ceph_snap_context *snapc,
1134 u64 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001135 int flags,
Alex Elder913d2fd2012-06-26 12:57:03 -07001136 struct ceph_osd_req_op *ops,
Alex Elderaded07e2012-07-03 16:01:18 -05001137 const char *object_name,
Alex Elderf8d4de62012-07-03 16:01:19 -05001138 u64 ofs, u64 inbound_size,
1139 char *inbound,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001140 struct ceph_osd_request **linger_req,
1141 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001142{
1143 int ret;
1144 struct page **pages;
1145 int num_pages;
Alex Elder913d2fd2012-06-26 12:57:03 -07001146
Alex Elderaafb2302012-09-06 16:00:54 -05001147 rbd_assert(ops != NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001148
Alex Elderf8d4de62012-07-03 16:01:19 -05001149 num_pages = calc_pages_for(ofs, inbound_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001150 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
Dan Carpenterb8d06382010-10-11 21:14:23 +02001151 if (IS_ERR(pages))
1152 return PTR_ERR(pages);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001153
Alex Elder0ce1a792012-07-03 16:01:18 -05001154 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
Alex Elderf8d4de62012-07-03 16:01:19 -05001155 object_name, ofs, inbound_size, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001156 pages, num_pages,
1157 flags,
1158 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001159 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001160 NULL,
1161 linger_req, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001162 if (ret < 0)
Alex Elder913d2fd2012-06-26 12:57:03 -07001163 goto done;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001164
Alex Elderf8d4de62012-07-03 16:01:19 -05001165 if ((flags & CEPH_OSD_FLAG_READ) && inbound)
1166 ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001167
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001168done:
1169 ceph_release_page_vector(pages, num_pages);
1170 return ret;
1171}
1172
/*
 * Do an asynchronous ceph osd operation
 *
 * Issues a single read or write (direction taken from @rq) for the
 * image range [ofs, ofs + len), which the caller has already sized
 * to fit within one object.  Completion is reported through
 * rbd_req_cb() into slot @coll_index of @coll.
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;
	int opcode;
	int flags;
	u64 snapid;

	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	if (rq_data_dir(rq) == WRITE) {
		opcode = CEPH_OSD_OP_WRITE;
		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
		snapid = CEPH_NOSNAP;
		payload_len = seg_len;
	} else {
		opcode = CEPH_OSD_OP_READ;
		flags = CEPH_OSD_FLAG_READ;
		/* reads go to a specific snapshot, not a write context */
		snapc = NULL;
		snapid = rbd_dev->mapping.snap_id;
		payload_len = 0;
	}

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1237
1238/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001239 * Request sync osd read
1240 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001241static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001242 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001243 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001244 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001245 char *buf,
1246 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001247{
Alex Elder913d2fd2012-06-26 12:57:03 -07001248 struct ceph_osd_req_op *ops;
1249 int ret;
1250
1251 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1252 if (!ops)
1253 return -ENOMEM;
1254
1255 ret = rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001256 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001257 CEPH_OSD_FLAG_READ,
Alex Elder913d2fd2012-06-26 12:57:03 -07001258 ops, object_name, ofs, len, buf, NULL, ver);
1259 rbd_destroy_ops(ops);
1260
1261 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001262}
1263
/*
 * Request sync osd notify ack
 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001267static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001268 u64 ver,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001269 u64 notify_id)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001270{
1271 struct ceph_osd_req_op *ops;
Sage Weil11f77002011-05-12 16:13:54 -07001272 int ret;
1273
Alex Elder57cfc102012-06-26 12:57:03 -07001274 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1275 if (!ops)
1276 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001277
Josh Durgina71b8912011-12-05 18:10:44 -08001278 ops[0].watch.ver = cpu_to_le64(ver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001279 ops[0].watch.cookie = notify_id;
1280 ops[0].watch.flag = 0;
1281
Alex Elder0ce1a792012-07-03 16:01:18 -05001282 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001283 rbd_dev->header_name, 0, 0, NULL,
Alex Elderad4f2322012-07-03 16:01:19 -05001284 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001285 CEPH_OSD_FLAG_READ,
1286 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001287 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001288 rbd_simple_req_cb, 0, NULL);
1289
1290 rbd_destroy_ops(ops);
1291 return ret;
1292}
1293
/*
 * Watch callback: the header object changed (e.g. snapshot taken or
 * image resized).  Refresh our cached header and acknowledge the
 * notification so the osd does not time us out.
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	u64 hver;
	int rc;

	if (!rbd_dev)
		return;

	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
	     rbd_dev->header_name, (unsigned long long) notify_id,
	     (unsigned int) opcode);
	rc = rbd_dev_refresh(rbd_dev, &hver);
	if (rc)
		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
			   " update snaps: %d\n", rbd_dev->major, rc);

	/*
	 * NOTE(review): if the refresh failed, hver may not have been
	 * written by rbd_dev_refresh() and is passed uninitialized
	 * here -- verify against rbd_dev_refresh()'s contract.
	 */
	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
}
1313
/*
 * Request sync osd watch
 *
 * Registers a watch on the image's header object: creates the osd
 * event (delivering notifications to rbd_watch_cb()), then issues a
 * lingering WATCH request.  On failure the event is torn down again.
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 == register the watch */

	/* lingering request: watch_request stays registered with the osd */
	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1357
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001358/*
1359 * Request sync osd unwatch
1360 */
Alex Elder070c6332012-07-25 09:32:41 -05001361static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001362{
1363 struct ceph_osd_req_op *ops;
Alex Elder57cfc102012-06-26 12:57:03 -07001364 int ret;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001365
Alex Elder57cfc102012-06-26 12:57:03 -07001366 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1367 if (!ops)
1368 return -ENOMEM;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001369
1370 ops[0].watch.ver = 0;
Alex Elder0ce1a792012-07-03 16:01:18 -05001371 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001372 ops[0].watch.flag = 0;
1373
Alex Elder0ce1a792012-07-03 16:01:18 -05001374 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001375 CEPH_NOSNAP,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001376 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1377 ops,
Alex Elder070c6332012-07-25 09:32:41 -05001378 rbd_dev->header_name,
1379 0, 0, NULL, NULL, NULL);
1380
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001381
1382 rbd_destroy_ops(ops);
Alex Elder0ce1a792012-07-03 16:01:18 -05001383 ceph_osdc_cancel_event(rbd_dev->watch_event);
1384 rbd_dev->watch_event = NULL;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001385 return ret;
1386}
1387
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001388/*
Alex Elder3cb4a682012-06-26 12:57:03 -07001389 * Synchronous osd object method call
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001390 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001391static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001392 const char *object_name,
1393 const char *class_name,
1394 const char *method_name,
Alex Elder3cb4a682012-06-26 12:57:03 -07001395 const char *outbound,
1396 size_t outbound_size,
Alex Elderf8d4de62012-07-03 16:01:19 -05001397 char *inbound,
1398 size_t inbound_size,
Alex Elder3cb4a682012-06-26 12:57:03 -07001399 int flags,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001400 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001401{
1402 struct ceph_osd_req_op *ops;
Alex Elderaded07e2012-07-03 16:01:18 -05001403 int class_name_len = strlen(class_name);
1404 int method_name_len = strlen(method_name);
Alex Elder3cb4a682012-06-26 12:57:03 -07001405 int payload_size;
Alex Elder57cfc102012-06-26 12:57:03 -07001406 int ret;
1407
Alex Elder3cb4a682012-06-26 12:57:03 -07001408 /*
1409 * Any input parameters required by the method we're calling
1410 * will be sent along with the class and method names as
1411 * part of the message payload. That data and its size are
1412 * supplied via the indata and indata_len fields (named from
1413 * the perspective of the server side) in the OSD request
1414 * operation.
1415 */
1416 payload_size = class_name_len + method_name_len + outbound_size;
1417 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
Alex Elder57cfc102012-06-26 12:57:03 -07001418 if (!ops)
1419 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001420
Alex Elderaded07e2012-07-03 16:01:18 -05001421 ops[0].cls.class_name = class_name;
1422 ops[0].cls.class_len = (__u8) class_name_len;
1423 ops[0].cls.method_name = method_name;
1424 ops[0].cls.method_len = (__u8) method_name_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001425 ops[0].cls.argc = 0;
Alex Elder3cb4a682012-06-26 12:57:03 -07001426 ops[0].cls.indata = outbound;
1427 ops[0].cls.indata_len = outbound_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001428
Alex Elder0ce1a792012-07-03 16:01:18 -05001429 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001430 CEPH_NOSNAP,
Alex Elder3cb4a682012-06-26 12:57:03 -07001431 flags, ops,
Alex Elderf8d4de62012-07-03 16:01:19 -05001432 object_name, 0, inbound_size, inbound,
1433 NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001434
1435 rbd_destroy_ops(ops);
1436
1437 dout("cls_exec returned %d\n", ret);
1438 return ret;
1439}
1440
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001441static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1442{
1443 struct rbd_req_coll *coll =
1444 kzalloc(sizeof(struct rbd_req_coll) +
1445 sizeof(struct rbd_req_status) * num_reqs,
1446 GFP_ATOMIC);
1447
1448 if (!coll)
1449 return NULL;
1450 coll->total = num_reqs;
1451 kref_init(&coll->kref);
1452 return coll;
1453}
1454
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001455/*
1456 * block device queue callback
1457 */
1458static void rbd_rq_fn(struct request_queue *q)
1459{
1460 struct rbd_device *rbd_dev = q->queuedata;
1461 struct request *rq;
1462 struct bio_pair *bp = NULL;
1463
Alex Elder00f1f362012-02-07 12:03:36 -06001464 while ((rq = blk_fetch_request(q))) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001465 struct bio *bio;
1466 struct bio *rq_bio, *next_bio = NULL;
1467 bool do_write;
Alex Elderbd919d42012-07-13 20:35:11 -05001468 unsigned int size;
1469 u64 op_size = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001470 u64 ofs;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001471 int num_segs, cur_seg = 0;
1472 struct rbd_req_coll *coll;
Josh Durgind1d25642011-12-05 14:03:05 -08001473 struct ceph_snap_context *snapc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001474
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001475 dout("fetched request\n");
1476
1477 /* filter out block requests we don't understand */
1478 if ((rq->cmd_type != REQ_TYPE_FS)) {
1479 __blk_end_request_all(rq, 0);
Alex Elder00f1f362012-02-07 12:03:36 -06001480 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001481 }
1482
1483 /* deduce our operation (read, write) */
1484 do_write = (rq_data_dir(rq) == WRITE);
1485
1486 size = blk_rq_bytes(rq);
Alex Elder593a9e72012-02-07 12:03:37 -06001487 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001488 rq_bio = rq->bio;
Alex Elderf84344f2012-08-31 17:29:51 -05001489 if (do_write && rbd_dev->mapping.read_only) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001490 __blk_end_request_all(rq, -EROFS);
Alex Elder00f1f362012-02-07 12:03:36 -06001491 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001492 }
1493
1494 spin_unlock_irq(q->queue_lock);
1495
Josh Durgind1d25642011-12-05 14:03:05 -08001496 down_read(&rbd_dev->header_rwsem);
Josh Durgine88a36e2011-11-21 18:14:25 -08001497
Alex Elderf84344f2012-08-31 17:29:51 -05001498 if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
1499 !rbd_dev->mapping.snap_exists) {
Josh Durgine88a36e2011-11-21 18:14:25 -08001500 up_read(&rbd_dev->header_rwsem);
Josh Durgind1d25642011-12-05 14:03:05 -08001501 dout("request for non-existent snapshot");
1502 spin_lock_irq(q->queue_lock);
1503 __blk_end_request_all(rq, -ENXIO);
1504 continue;
Josh Durgine88a36e2011-11-21 18:14:25 -08001505 }
1506
Josh Durgind1d25642011-12-05 14:03:05 -08001507 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1508
1509 up_read(&rbd_dev->header_rwsem);
1510
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001511 dout("%s 0x%x bytes at 0x%llx\n",
1512 do_write ? "write" : "read",
Alex Elderbd919d42012-07-13 20:35:11 -05001513 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001514
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001515 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
Alex Elderdf111be2012-08-09 10:33:26 -07001516 if (num_segs <= 0) {
1517 spin_lock_irq(q->queue_lock);
1518 __blk_end_request_all(rq, num_segs);
1519 ceph_put_snap_context(snapc);
1520 continue;
1521 }
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001522 coll = rbd_alloc_coll(num_segs);
1523 if (!coll) {
1524 spin_lock_irq(q->queue_lock);
1525 __blk_end_request_all(rq, -ENOMEM);
Josh Durgind1d25642011-12-05 14:03:05 -08001526 ceph_put_snap_context(snapc);
Alex Elder00f1f362012-02-07 12:03:36 -06001527 continue;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001528 }
1529
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001530 do {
1531 /* a bio clone to be passed down to OSD req */
Alex Elderbd919d42012-07-13 20:35:11 -05001532 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
Alex Elder65ccfe22012-08-09 10:33:26 -07001533 op_size = rbd_segment_length(rbd_dev, ofs, size);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001534 kref_get(&coll->kref);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001535 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1536 op_size, GFP_ATOMIC);
Alex Elder46342462012-10-10 18:59:29 -07001537 if (bio)
1538 (void) rbd_do_op(rq, rbd_dev, snapc,
1539 ofs, op_size,
1540 bio, coll, cur_seg);
1541 else
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001542 rbd_coll_end_req_index(rq, coll, cur_seg,
1543 -ENOMEM, op_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001544 size -= op_size;
1545 ofs += op_size;
1546
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001547 cur_seg++;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001548 rq_bio = next_bio;
1549 } while (size > 0);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001550 kref_put(&coll->kref, rbd_coll_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001551
1552 if (bp)
1553 bio_pair_release(bp);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001554 spin_lock_irq(q->queue_lock);
Josh Durgind1d25642011-12-05 14:03:05 -08001555
1556 ceph_put_snap_context(snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001557 }
1558}
1559
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone
 *
 * Returns the number of bytes of @bvec that may be merged into the
 * request described by @bmd (possibly 0, which rejects the merge).
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	sector_t sector_offset;
	sector_t sectors_per_obj;
	sector_t obj_sector_offset;
	int ret;

	/*
	 * Find how far into its rbd object the partition-relative
	 * bio start sector is to offset relative to the enclosing
	 * device.
	 */
	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
	/* Object size is 1 << obj_order bytes, so this mask math relies
	 * on it being a power of two. */
	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	obj_sector_offset = sector_offset & (sectors_per_obj - 1);

	/*
	 * Compute the number of bytes from that offset to the end
	 * of the object.  Account for what's already used by the bio.
	 */
	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
	if (ret > bmd->bi_size)
		ret -= bmd->bi_size;
	else
		ret = 0;

	/*
	 * Don't send back more than was asked for.  And if the bio
	 * was empty, let the whole thing through because:  "Note
	 * that a block device *must* allow a single page to be
	 * added to an empty bio."
	 */
	rbd_assert(bvec->bv_len <= PAGE_SIZE);
	if (ret > (int) bvec->bv_len || !bmd->bi_size)
		ret = (int) bvec->bv_len;

	return ret;
}
1605
1606static void rbd_free_disk(struct rbd_device *rbd_dev)
1607{
1608 struct gendisk *disk = rbd_dev->disk;
1609
1610 if (!disk)
1611 return;
1612
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001613 if (disk->flags & GENHD_FL_UP)
1614 del_gendisk(disk);
1615 if (disk->queue)
1616 blk_cleanup_queue(disk->queue);
1617 put_disk(disk);
1618}
1619
/*
 * Read the complete header for the given rbd device.
 *
 * Returns a pointer to a dynamically-allocated buffer containing
 * the complete and validated header.  Caller can pass the address
 * of a variable that will be filled in with the version of the
 * header object at the time it was read.
 *
 * Returns a pointer-coded errno if a failure occurs.
 * The caller is responsible for kfree()ing the returned buffer.
 */
static struct rbd_image_header_ondisk *
rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		/* Free the buffer from the previous (too small) attempt */
		kfree(ondisk);

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return ERR_PTR(-ENOMEM);

		ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
				       rbd_dev->header_name,
				       0, size,
				       (char *) ondisk, version);

		if (ret < 0)
			goto out_err;
		/* A short read means the object shrank underneath us */
		if (WARN_ON((size_t) ret < size)) {
			ret = -ENXIO;
			pr_warning("short header read for image %s"
					" (want %zd got %d)\n",
				rbd_dev->image_name, size, ret);
			goto out_err;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			pr_warning("invalid header for image %s\n",
				rbd_dev->image_name);
			goto out_err;
		}

		/* If more snapshots appeared since we sized the buffer,
		 * loop and read again with the larger size. */
		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	return ondisk;

out_err:
	kfree(ondisk);

	return ERR_PTR(ret);
}
1691
1692/*
1693 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001694 */
1695static int rbd_read_header(struct rbd_device *rbd_dev,
1696 struct rbd_image_header *header)
1697{
Alex Elder4156d992012-08-02 11:29:46 -05001698 struct rbd_image_header_ondisk *ondisk;
1699 u64 ver = 0;
1700 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001701
Alex Elder4156d992012-08-02 11:29:46 -05001702 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1703 if (IS_ERR(ondisk))
1704 return PTR_ERR(ondisk);
1705 ret = rbd_header_from_disk(header, ondisk);
1706 if (ret >= 0)
1707 header->obj_version = ver;
1708 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001709
Alex Elder4156d992012-08-02 11:29:46 -05001710 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001711}
1712
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001713static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1714{
1715 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001716 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001717
Alex Eldera0593292012-07-19 09:09:27 -05001718 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder14e70852012-07-19 09:09:27 -05001719 __rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001720}
1721
Alex Elder94785542012-10-09 13:50:17 -07001722static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1723{
1724 sector_t size;
1725
1726 if (rbd_dev->mapping.snap_id != CEPH_NOSNAP)
1727 return;
1728
1729 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1730 dout("setting size to %llu sectors", (unsigned long long) size);
1731 rbd_dev->mapping.size = (u64) size;
1732 set_capacity(rbd_dev->disk, size);
1733}
1734
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001735/*
1736 * only read the first part of the ondisk header, without the snaps info
1737 */
Alex Elder117973f2012-08-31 17:29:55 -05001738static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001739{
1740 int ret;
1741 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001742
1743 ret = rbd_read_header(rbd_dev, &h);
1744 if (ret < 0)
1745 return ret;
1746
Josh Durgina51aa0c2011-12-05 10:35:04 -08001747 down_write(&rbd_dev->header_rwsem);
1748
Alex Elder94785542012-10-09 13:50:17 -07001749 /* Update image size, and check for resize of mapped image */
1750 rbd_dev->header.image_size = h.image_size;
1751 rbd_update_mapping_size(rbd_dev);
Sage Weil9db4b3e2011-04-19 22:49:06 -07001752
Alex Elder849b4262012-07-09 21:04:24 -05001753 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001754 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05001755 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08001756 /* osd requests may still refer to snapc */
1757 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001758
Alex Elderb8136232012-07-25 09:32:41 -05001759 if (hver)
1760 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08001761 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08001762 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001763 rbd_dev->header.snapc = h.snapc;
1764 rbd_dev->header.snap_names = h.snap_names;
1765 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05001766 /* Free the extra copy of the object prefix */
1767 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1768 kfree(h.object_prefix);
1769
Alex Elder304f6802012-08-31 17:29:52 -05001770 ret = rbd_dev_snaps_update(rbd_dev);
1771 if (!ret)
1772 ret = rbd_dev_snaps_register(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001773
Josh Durginc6666012011-11-21 17:11:12 -08001774 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001775
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001776 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001777}
1778
Alex Elder117973f2012-08-31 17:29:55 -05001779static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
Alex Elder1fe5e992012-07-25 09:32:41 -05001780{
1781 int ret;
1782
Alex Elder117973f2012-08-31 17:29:55 -05001783 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder1fe5e992012-07-25 09:32:41 -05001784 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05001785 if (rbd_dev->image_format == 1)
1786 ret = rbd_dev_v1_refresh(rbd_dev, hver);
1787 else
1788 ret = rbd_dev_v2_refresh(rbd_dev, hver);
Alex Elder1fe5e992012-07-25 09:32:41 -05001789 mutex_unlock(&ctl_mutex);
1790
1791 return ret;
1792}
1793
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001794static int rbd_init_disk(struct rbd_device *rbd_dev)
1795{
1796 struct gendisk *disk;
1797 struct request_queue *q;
Alex Elder593a9e72012-02-07 12:03:37 -06001798 u64 segment_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001799
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001800 /* create gendisk info */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001801 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1802 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05001803 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001804
Alex Elderf0f8cef2012-01-29 13:57:44 -06001805 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05001806 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001807 disk->major = rbd_dev->major;
1808 disk->first_minor = 0;
1809 disk->fops = &rbd_bd_ops;
1810 disk->private_data = rbd_dev;
1811
1812 /* init rq */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001813 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1814 if (!q)
1815 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07001816
Alex Elder593a9e72012-02-07 12:03:37 -06001817 /* We use the default size, but let's be explicit about it. */
1818 blk_queue_physical_block_size(q, SECTOR_SIZE);
1819
Josh Durgin029bcbd2011-07-22 11:35:23 -07001820 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06001821 segment_size = rbd_obj_bytes(&rbd_dev->header);
1822 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1823 blk_queue_max_segment_size(q, segment_size);
1824 blk_queue_io_min(q, segment_size);
1825 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07001826
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001827 blk_queue_merge_bvec(q, rbd_merge_bvec);
1828 disk->queue = q;
1829
1830 q->queuedata = rbd_dev;
1831
1832 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001833
Alex Elder12f02942012-08-29 17:11:07 -05001834 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
1835
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001836 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001837out_disk:
1838 put_disk(disk);
Alex Elder1fcdb8a2012-08-29 17:11:06 -05001839
1840 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001841}
1842
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001843/*
1844 sysfs
1845*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001846
Alex Elder593a9e72012-02-07 12:03:37 -06001847static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1848{
1849 return container_of(dev, struct rbd_device, dev);
1850}
1851
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001852static ssize_t rbd_size_show(struct device *dev,
1853 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001854{
Alex Elder593a9e72012-02-07 12:03:37 -06001855 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001856 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001857
Josh Durgina51aa0c2011-12-05 10:35:04 -08001858 down_read(&rbd_dev->header_rwsem);
1859 size = get_capacity(rbd_dev->disk);
1860 up_read(&rbd_dev->header_rwsem);
1861
1862 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001863}
1864
Alex Elder34b13182012-07-13 20:35:12 -05001865/*
1866 * Note this shows the features for whatever's mapped, which is not
1867 * necessarily the base image.
1868 */
1869static ssize_t rbd_features_show(struct device *dev,
1870 struct device_attribute *attr, char *buf)
1871{
1872 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1873
1874 return sprintf(buf, "0x%016llx\n",
1875 (unsigned long long) rbd_dev->mapping.features);
1876}
1877
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001878static ssize_t rbd_major_show(struct device *dev,
1879 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001880{
Alex Elder593a9e72012-02-07 12:03:37 -06001881 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001882
1883 return sprintf(buf, "%d\n", rbd_dev->major);
1884}
1885
/* sysfs "client_id": ceph client instance id, e.g. "client1234". */
static ssize_t rbd_client_id_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "client%lld\n",
			ceph_client_id(rbd_dev->rbd_client->client));
}
1894
/* sysfs "pool": name of the rados pool holding the image. */
static ssize_t rbd_pool_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->pool_name);
}
1902
Alex Elder9bb2f332012-07-12 10:46:35 -05001903static ssize_t rbd_pool_id_show(struct device *dev,
1904 struct device_attribute *attr, char *buf)
1905{
1906 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1907
1908 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1909}
1910
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001911static ssize_t rbd_name_show(struct device *dev,
1912 struct device_attribute *attr, char *buf)
1913{
Alex Elder593a9e72012-02-07 12:03:37 -06001914 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001915
Alex Elder0bed54d2012-07-03 16:01:18 -05001916 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001917}
1918
Alex Elder589d30e2012-07-10 20:30:11 -05001919static ssize_t rbd_image_id_show(struct device *dev,
1920 struct device_attribute *attr, char *buf)
1921{
1922 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1923
1924 return sprintf(buf, "%s\n", rbd_dev->image_id);
1925}
1926
Alex Elder34b13182012-07-13 20:35:12 -05001927/*
1928 * Shows the name of the currently-mapped snapshot (or
1929 * RBD_SNAP_HEAD_NAME for the base image).
1930 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001931static ssize_t rbd_snap_show(struct device *dev,
1932 struct device_attribute *attr,
1933 char *buf)
1934{
Alex Elder593a9e72012-02-07 12:03:37 -06001935 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001936
Alex Elderf84344f2012-08-31 17:29:51 -05001937 return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001938}
1939
/*
 * sysfs "refresh" (write-only): writing anything triggers a re-read of
 * the image header.  Returns the written size on success, or a negative
 * errno from the refresh.
 */
static ssize_t rbd_image_refresh(struct device *dev,
				 struct device_attribute *attr,
				 const char *buf,
				 size_t size)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int ret;

	ret = rbd_dev_refresh(rbd_dev, NULL);

	return ret < 0 ? ret : size;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001952
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001953static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
Alex Elder34b13182012-07-13 20:35:12 -05001954static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001955static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
1956static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
1957static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
Alex Elder9bb2f332012-07-12 10:46:35 -05001958static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001959static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
Alex Elder589d30e2012-07-10 20:30:11 -05001960static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001961static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
1962static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001963
1964static struct attribute *rbd_attrs[] = {
1965 &dev_attr_size.attr,
Alex Elder34b13182012-07-13 20:35:12 -05001966 &dev_attr_features.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001967 &dev_attr_major.attr,
1968 &dev_attr_client_id.attr,
1969 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05001970 &dev_attr_pool_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001971 &dev_attr_name.attr,
Alex Elder589d30e2012-07-10 20:30:11 -05001972 &dev_attr_image_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001973 &dev_attr_current_snap.attr,
1974 &dev_attr_refresh.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001975 NULL
1976};
1977
1978static struct attribute_group rbd_attr_group = {
1979 .attrs = rbd_attrs,
1980};
1981
1982static const struct attribute_group *rbd_attr_groups[] = {
1983 &rbd_attr_group,
1984 NULL
1985};
1986
1987static void rbd_sysfs_dev_release(struct device *dev)
1988{
1989}
1990
1991static struct device_type rbd_device_type = {
1992 .name = "rbd",
1993 .groups = rbd_attr_groups,
1994 .release = rbd_sysfs_dev_release,
1995};
1996
1997
1998/*
1999 sysfs - snapshots
2000*/
2001
2002static ssize_t rbd_snap_size_show(struct device *dev,
2003 struct device_attribute *attr,
2004 char *buf)
2005{
2006 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2007
Josh Durgin35915382011-12-05 18:25:13 -08002008 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002009}
2010
/* sysfs "snap_id": snapshot id. */
static ssize_t rbd_snap_id_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
}
2019
Alex Elder34b13182012-07-13 20:35:12 -05002020static ssize_t rbd_snap_features_show(struct device *dev,
2021 struct device_attribute *attr,
2022 char *buf)
2023{
2024 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2025
2026 return sprintf(buf, "0x%016llx\n",
2027 (unsigned long long) snap->features);
2028}
2029
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002030static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2031static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
Alex Elder34b13182012-07-13 20:35:12 -05002032static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002033
2034static struct attribute *rbd_snap_attrs[] = {
2035 &dev_attr_snap_size.attr,
2036 &dev_attr_snap_id.attr,
Alex Elder34b13182012-07-13 20:35:12 -05002037 &dev_attr_snap_features.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002038 NULL,
2039};
2040
2041static struct attribute_group rbd_snap_attr_group = {
2042 .attrs = rbd_snap_attrs,
2043};
2044
2045static void rbd_snap_dev_release(struct device *dev)
2046{
2047 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2048 kfree(snap->name);
2049 kfree(snap);
2050}
2051
2052static const struct attribute_group *rbd_snap_attr_groups[] = {
2053 &rbd_snap_attr_group,
2054 NULL
2055};
2056
2057static struct device_type rbd_snap_device_type = {
2058 .groups = rbd_snap_attr_groups,
2059 .release = rbd_snap_dev_release,
2060};
2061
Alex Elder304f6802012-08-31 17:29:52 -05002062static bool rbd_snap_registered(struct rbd_snap *snap)
2063{
2064 bool ret = snap->dev.type == &rbd_snap_device_type;
2065 bool reg = device_is_registered(&snap->dev);
2066
2067 rbd_assert(!ret ^ reg);
2068
2069 return ret;
2070}
2071
Alex Elder14e70852012-07-19 09:09:27 -05002072static void __rbd_remove_snap_dev(struct rbd_snap *snap)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002073{
2074 list_del(&snap->node);
Alex Elder304f6802012-08-31 17:29:52 -05002075 if (device_is_registered(&snap->dev))
2076 device_unregister(&snap->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002077}
2078
Alex Elder14e70852012-07-19 09:09:27 -05002079static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002080 struct device *parent)
2081{
2082 struct device *dev = &snap->dev;
2083 int ret;
2084
2085 dev->type = &rbd_snap_device_type;
2086 dev->parent = parent;
2087 dev->release = rbd_snap_dev_release;
Alex Elderd4b125e2012-07-03 16:01:19 -05002088 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
Alex Elder304f6802012-08-31 17:29:52 -05002089 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2090
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002091 ret = device_register(dev);
2092
2093 return ret;
2094}
2095
Alex Elder4e891e02012-07-10 20:30:10 -05002096static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05002097 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05002098 u64 snap_id, u64 snap_size,
2099 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002100{
Alex Elder4e891e02012-07-10 20:30:10 -05002101 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002102 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002103
2104 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002105 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002106 return ERR_PTR(-ENOMEM);
2107
2108 ret = -ENOMEM;
Alex Elderc8d18422012-07-10 20:30:11 -05002109 snap->name = kstrdup(snap_name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002110 if (!snap->name)
2111 goto err;
2112
Alex Elderc8d18422012-07-10 20:30:11 -05002113 snap->id = snap_id;
2114 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05002115 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05002116
2117 return snap;
2118
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002119err:
2120 kfree(snap->name);
2121 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002122
2123 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002124}
2125
Alex Eldercd892122012-07-03 16:01:19 -05002126static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2127 u64 *snap_size, u64 *snap_features)
2128{
2129 char *snap_name;
2130
2131 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2132
2133 *snap_size = rbd_dev->header.snap_sizes[which];
2134 *snap_features = 0; /* No features for v1 */
2135
2136 /* Skip over names until we find the one we are looking for */
2137
2138 snap_name = rbd_dev->header.snap_names;
2139 while (which--)
2140 snap_name += strlen(snap_name) + 1;
2141
2142 return snap_name;
2143}
2144
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002145/*
Alex Elder9d475de2012-07-03 16:01:19 -05002146 * Get the size and object order for an image snapshot, or if
2147 * snap_id is CEPH_NOSNAP, gets this information for the base
2148 * image.
2149 */
2150static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
2151 u8 *order, u64 *snap_size)
2152{
2153 __le64 snapid = cpu_to_le64(snap_id);
2154 int ret;
2155 struct {
2156 u8 order;
2157 __le64 size;
2158 } __attribute__ ((packed)) size_buf = { 0 };
2159
2160 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2161 "rbd", "get_size",
2162 (char *) &snapid, sizeof (snapid),
2163 (char *) &size_buf, sizeof (size_buf),
2164 CEPH_OSD_FLAG_READ, NULL);
2165 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2166 if (ret < 0)
2167 return ret;
2168
2169 *order = size_buf.order;
2170 *snap_size = le64_to_cpu(size_buf.size);
2171
2172 dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
2173 (unsigned long long) snap_id, (unsigned int) *order,
2174 (unsigned long long) *snap_size);
2175
2176 return 0;
2177}
2178
/* Fetch the base image's object order and size into the header. */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
2185
Alex Elder1e130192012-07-03 16:01:19 -05002186static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2187{
2188 void *reply_buf;
2189 int ret;
2190 void *p;
2191
2192 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
2193 if (!reply_buf)
2194 return -ENOMEM;
2195
2196 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2197 "rbd", "get_object_prefix",
2198 NULL, 0,
2199 reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
2200 CEPH_OSD_FLAG_READ, NULL);
2201 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2202 if (ret < 0)
2203 goto out;
Alex Eldera0ea3a42012-10-10 21:19:13 -07002204 ret = 0; /* rbd_req_sync_exec() can return positive */
Alex Elder1e130192012-07-03 16:01:19 -05002205
2206 p = reply_buf;
2207 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
2208 p + RBD_OBJ_PREFIX_LEN_MAX,
2209 NULL, GFP_NOIO);
2210
2211 if (IS_ERR(rbd_dev->header.object_prefix)) {
2212 ret = PTR_ERR(rbd_dev->header.object_prefix);
2213 rbd_dev->header.object_prefix = NULL;
2214 } else {
2215 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
2216 }
2217
2218out:
2219 kfree(reply_buf);
2220
2221 return ret;
2222}
2223
Alex Elderb1b54022012-07-03 16:01:19 -05002224static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2225 u64 *snap_features)
2226{
2227 __le64 snapid = cpu_to_le64(snap_id);
2228 struct {
2229 __le64 features;
2230 __le64 incompat;
2231 } features_buf = { 0 };
Alex Elderd8891402012-10-09 13:50:17 -07002232 u64 incompat;
Alex Elderb1b54022012-07-03 16:01:19 -05002233 int ret;
2234
2235 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2236 "rbd", "get_features",
2237 (char *) &snapid, sizeof (snapid),
2238 (char *) &features_buf, sizeof (features_buf),
2239 CEPH_OSD_FLAG_READ, NULL);
2240 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2241 if (ret < 0)
2242 return ret;
Alex Elderd8891402012-10-09 13:50:17 -07002243
2244 incompat = le64_to_cpu(features_buf.incompat);
2245 if (incompat & ~RBD_FEATURES_ALL)
2246 return -ENOTSUPP;
2247
Alex Elderb1b54022012-07-03 16:01:19 -05002248 *snap_features = le64_to_cpu(features_buf.features);
2249
2250 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2251 (unsigned long long) snap_id,
2252 (unsigned long long) *snap_features,
2253 (unsigned long long) le64_to_cpu(features_buf.incompat));
2254
2255 return 0;
2256}
2257
/* Fetch the base image's feature mask into the header. */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}
2263
Alex Elder6e14b1a2012-07-03 16:01:19 -05002264static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05002265{
2266 size_t size;
2267 int ret;
2268 void *reply_buf;
2269 void *p;
2270 void *end;
2271 u64 seq;
2272 u32 snap_count;
2273 struct ceph_snap_context *snapc;
2274 u32 i;
2275
2276 /*
2277 * We'll need room for the seq value (maximum snapshot id),
2278 * snapshot count, and array of that many snapshot ids.
2279 * For now we have a fixed upper limit on the number we're
2280 * prepared to receive.
2281 */
2282 size = sizeof (__le64) + sizeof (__le32) +
2283 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2284 reply_buf = kzalloc(size, GFP_KERNEL);
2285 if (!reply_buf)
2286 return -ENOMEM;
2287
2288 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2289 "rbd", "get_snapcontext",
2290 NULL, 0,
2291 reply_buf, size,
Alex Elder6e14b1a2012-07-03 16:01:19 -05002292 CEPH_OSD_FLAG_READ, ver);
Alex Elder35d489f2012-07-03 16:01:19 -05002293 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2294 if (ret < 0)
2295 goto out;
2296
2297 ret = -ERANGE;
2298 p = reply_buf;
2299 end = (char *) reply_buf + size;
2300 ceph_decode_64_safe(&p, end, seq, out);
2301 ceph_decode_32_safe(&p, end, snap_count, out);
2302
2303 /*
2304 * Make sure the reported number of snapshot ids wouldn't go
2305 * beyond the end of our buffer. But before checking that,
2306 * make sure the computed size of the snapshot context we
2307 * allocate is representable in a size_t.
2308 */
2309 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2310 / sizeof (u64)) {
2311 ret = -EINVAL;
2312 goto out;
2313 }
2314 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2315 goto out;
2316
2317 size = sizeof (struct ceph_snap_context) +
2318 snap_count * sizeof (snapc->snaps[0]);
2319 snapc = kmalloc(size, GFP_KERNEL);
2320 if (!snapc) {
2321 ret = -ENOMEM;
2322 goto out;
2323 }
2324
2325 atomic_set(&snapc->nref, 1);
2326 snapc->seq = seq;
2327 snapc->num_snaps = snap_count;
2328 for (i = 0; i < snap_count; i++)
2329 snapc->snaps[i] = ceph_decode_64(&p);
2330
2331 rbd_dev->header.snapc = snapc;
2332
2333 dout(" snap context seq = %llu, snap_count = %u\n",
2334 (unsigned long long) seq, (unsigned int) snap_count);
2335
2336out:
2337 kfree(reply_buf);
2338
2339 return 0;
2340}
2341
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002342static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
2343{
2344 size_t size;
2345 void *reply_buf;
2346 __le64 snap_id;
2347 int ret;
2348 void *p;
2349 void *end;
2350 size_t snap_name_len;
2351 char *snap_name;
2352
2353 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
2354 reply_buf = kmalloc(size, GFP_KERNEL);
2355 if (!reply_buf)
2356 return ERR_PTR(-ENOMEM);
2357
2358 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
2359 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2360 "rbd", "get_snapshot_name",
2361 (char *) &snap_id, sizeof (snap_id),
2362 reply_buf, size,
2363 CEPH_OSD_FLAG_READ, NULL);
2364 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2365 if (ret < 0)
2366 goto out;
2367
2368 p = reply_buf;
2369 end = (char *) reply_buf + size;
2370 snap_name_len = 0;
2371 snap_name = ceph_extract_encoded_string(&p, end, &snap_name_len,
2372 GFP_KERNEL);
2373 if (IS_ERR(snap_name)) {
2374 ret = PTR_ERR(snap_name);
2375 goto out;
2376 } else {
2377 dout(" snap_id 0x%016llx snap_name = %s\n",
2378 (unsigned long long) le64_to_cpu(snap_id), snap_name);
2379 }
2380 kfree(reply_buf);
2381
2382 return snap_name;
2383out:
2384 kfree(reply_buf);
2385
2386 return ERR_PTR(ret);
2387}
2388
2389static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2390 u64 *snap_size, u64 *snap_features)
2391{
2392 __le64 snap_id;
2393 u8 order;
2394 int ret;
2395
2396 snap_id = rbd_dev->header.snapc->snaps[which];
2397 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2398 if (ret)
2399 return ERR_PTR(ret);
2400 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2401 if (ret)
2402 return ERR_PTR(ret);
2403
2404 return rbd_dev_v2_snap_name(rbd_dev, which);
2405}
2406
2407static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2408 u64 *snap_size, u64 *snap_features)
2409{
2410 if (rbd_dev->image_format == 1)
2411 return rbd_dev_v1_snap_info(rbd_dev, which,
2412 snap_size, snap_features);
2413 if (rbd_dev->image_format == 2)
2414 return rbd_dev_v2_snap_info(rbd_dev, which,
2415 snap_size, snap_features);
2416 return ERR_PTR(-EINVAL);
2417}
2418
Alex Elder117973f2012-08-31 17:29:55 -05002419static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2420{
2421 int ret;
2422 __u8 obj_order;
2423
2424 down_write(&rbd_dev->header_rwsem);
2425
2426 /* Grab old order first, to see if it changes */
2427
2428 obj_order = rbd_dev->header.obj_order,
2429 ret = rbd_dev_v2_image_size(rbd_dev);
2430 if (ret)
2431 goto out;
2432 if (rbd_dev->header.obj_order != obj_order) {
2433 ret = -EIO;
2434 goto out;
2435 }
2436 rbd_update_mapping_size(rbd_dev);
2437
2438 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2439 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2440 if (ret)
2441 goto out;
2442 ret = rbd_dev_snaps_update(rbd_dev);
2443 dout("rbd_dev_snaps_update returned %d\n", ret);
2444 if (ret)
2445 goto out;
2446 ret = rbd_dev_snaps_register(rbd_dev);
2447 dout("rbd_dev_snaps_register returned %d\n", ret);
2448out:
2449 up_write(&rbd_dev->header_rwsem);
2450
2451 return ret;
2452}
2453
Alex Elder9d475de2012-07-03 16:01:19 -05002454/*
Alex Elder35938152012-08-02 11:29:46 -05002455 * Scan the rbd device's current snapshot list and compare it to the
2456 * newly-received snapshot context. Remove any existing snapshots
2457 * not present in the new snapshot context. Add a new snapshot for
 * any snapshots in the snapshot context not in the current list.
2459 * And verify there are no changes to snapshots we already know
2460 * about.
2461 *
2462 * Assumes the snapshots in the snapshot context are sorted by
2463 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2464 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002465 */
Alex Elder304f6802012-08-31 17:29:52 -05002466static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002467{
Alex Elder35938152012-08-02 11:29:46 -05002468 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2469 const u32 snap_count = snapc->num_snaps;
Alex Elder35938152012-08-02 11:29:46 -05002470 struct list_head *head = &rbd_dev->snaps;
2471 struct list_head *links = head->next;
2472 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002473
Alex Elder9fcbb802012-08-23 23:48:49 -05002474 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
Alex Elder35938152012-08-02 11:29:46 -05002475 while (index < snap_count || links != head) {
2476 u64 snap_id;
2477 struct rbd_snap *snap;
Alex Eldercd892122012-07-03 16:01:19 -05002478 char *snap_name;
2479 u64 snap_size = 0;
2480 u64 snap_features = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002481
Alex Elder35938152012-08-02 11:29:46 -05002482 snap_id = index < snap_count ? snapc->snaps[index]
2483 : CEPH_NOSNAP;
2484 snap = links != head ? list_entry(links, struct rbd_snap, node)
2485 : NULL;
Alex Elderaafb2302012-09-06 16:00:54 -05002486 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002487
Alex Elder35938152012-08-02 11:29:46 -05002488 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2489 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002490
Alex Elder35938152012-08-02 11:29:46 -05002491 /* Existing snapshot not in the new snap context */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002492
Alex Elderf84344f2012-08-31 17:29:51 -05002493 if (rbd_dev->mapping.snap_id == snap->id)
2494 rbd_dev->mapping.snap_exists = false;
Alex Elder35938152012-08-02 11:29:46 -05002495 __rbd_remove_snap_dev(snap);
Alex Elder9fcbb802012-08-23 23:48:49 -05002496 dout("%ssnap id %llu has been removed\n",
Alex Elderf84344f2012-08-31 17:29:51 -05002497 rbd_dev->mapping.snap_id == snap->id ?
2498 "mapped " : "",
Alex Elder9fcbb802012-08-23 23:48:49 -05002499 (unsigned long long) snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002500
Alex Elder35938152012-08-02 11:29:46 -05002501 /* Done with this list entry; advance */
2502
2503 links = next;
2504 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002505 }
Alex Elder35938152012-08-02 11:29:46 -05002506
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002507 snap_name = rbd_dev_snap_info(rbd_dev, index,
2508 &snap_size, &snap_features);
Alex Eldercd892122012-07-03 16:01:19 -05002509 if (IS_ERR(snap_name))
2510 return PTR_ERR(snap_name);
2511
Alex Elder9fcbb802012-08-23 23:48:49 -05002512 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2513 (unsigned long long) snap_id);
Alex Elder35938152012-08-02 11:29:46 -05002514 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2515 struct rbd_snap *new_snap;
2516
2517 /* We haven't seen this snapshot before */
2518
Alex Elderc8d18422012-07-10 20:30:11 -05002519 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
Alex Eldercd892122012-07-03 16:01:19 -05002520 snap_id, snap_size, snap_features);
Alex Elder9fcbb802012-08-23 23:48:49 -05002521 if (IS_ERR(new_snap)) {
2522 int err = PTR_ERR(new_snap);
2523
2524 dout(" failed to add dev, error %d\n", err);
2525
2526 return err;
2527 }
Alex Elder35938152012-08-02 11:29:46 -05002528
2529 /* New goes before existing, or at end of list */
2530
Alex Elder9fcbb802012-08-23 23:48:49 -05002531 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05002532 if (snap)
2533 list_add_tail(&new_snap->node, &snap->node);
2534 else
Alex Elder523f3252012-08-30 00:16:37 -05002535 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05002536 } else {
2537 /* Already have this one */
2538
Alex Elder9fcbb802012-08-23 23:48:49 -05002539 dout(" already present\n");
2540
Alex Eldercd892122012-07-03 16:01:19 -05002541 rbd_assert(snap->size == snap_size);
Alex Elderaafb2302012-09-06 16:00:54 -05002542 rbd_assert(!strcmp(snap->name, snap_name));
Alex Eldercd892122012-07-03 16:01:19 -05002543 rbd_assert(snap->features == snap_features);
Alex Elder35938152012-08-02 11:29:46 -05002544
2545 /* Done with this list entry; advance */
2546
2547 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002548 }
Alex Elder35938152012-08-02 11:29:46 -05002549
2550 /* Advance to the next entry in the snapshot context */
2551
2552 index++;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002553 }
Alex Elder9fcbb802012-08-23 23:48:49 -05002554 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002555
2556 return 0;
2557}
2558
Alex Elder304f6802012-08-31 17:29:52 -05002559/*
2560 * Scan the list of snapshots and register the devices for any that
2561 * have not already been registered.
2562 */
2563static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2564{
2565 struct rbd_snap *snap;
2566 int ret = 0;
2567
2568 dout("%s called\n", __func__);
Alex Elder86ff77b2012-08-31 17:29:53 -05002569 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2570 return -EIO;
Alex Elder304f6802012-08-31 17:29:52 -05002571
2572 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2573 if (!rbd_snap_registered(snap)) {
2574 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2575 if (ret < 0)
2576 break;
2577 }
2578 }
2579 dout("%s: returning %d\n", __func__, ret);
2580
2581 return ret;
2582}
2583
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002584static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2585{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002586 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05002587 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002588
2589 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002590
Alex Eldercd789ab2012-08-30 00:16:38 -05002591 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002592 dev->bus = &rbd_bus_type;
2593 dev->type = &rbd_device_type;
2594 dev->parent = &rbd_root_dev;
2595 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05002596 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002597 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002598
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002599 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05002600
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002601 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002602}
2603
/*
 * Remove the rbd device from the bus; undoes the device_register()
 * done by rbd_bus_add_dev().  Final cleanup happens via the
 * device's release callback (rbd_dev_release).
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2608
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002609static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2610{
2611 int ret, rc;
2612
2613 do {
Alex Elder0e6f3222012-07-25 09:32:40 -05002614 ret = rbd_req_sync_watch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002615 if (ret == -ERANGE) {
Alex Elder117973f2012-08-31 17:29:55 -05002616 rc = rbd_dev_refresh(rbd_dev, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002617 if (rc < 0)
2618 return rc;
2619 }
2620 } while (ret == -ERANGE);
2621
2622 return ret;
2623}
2624
Alex Eldere2839302012-08-29 17:11:06 -05002625static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06002626
2627/*
Alex Elder499afd52012-02-02 08:13:29 -06002628 * Get a unique rbd identifier for the given new rbd_dev, and add
2629 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06002630 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* Ids start at 1; rbd_dev_id_max was initialized to 0 */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	/* Make the new device visible on the global list */
	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002641
Alex Elder1ddbe942012-01-29 13:57:44 -06002642/*
Alex Elder499afd52012-02-02 08:13:29 -06002643 * Remove an rbd_dev from the global list, and record that its
2644 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002645 */
Alex Eldere2839302012-08-29 17:11:06 -05002646static void rbd_dev_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002647{
Alex Elderd184f6b2012-01-29 13:57:44 -06002648 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05002649 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002650 int max_id;
2651
Alex Elderaafb2302012-09-06 16:00:54 -05002652 rbd_assert(rbd_id > 0);
Alex Elder499afd52012-02-02 08:13:29 -06002653
Alex Eldere2839302012-08-29 17:11:06 -05002654 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2655 (unsigned long long) rbd_dev->dev_id);
Alex Elder499afd52012-02-02 08:13:29 -06002656 spin_lock(&rbd_dev_list_lock);
2657 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002658
2659 /*
2660 * If the id being "put" is not the current maximum, there
2661 * is nothing special we need to do.
2662 */
Alex Eldere2839302012-08-29 17:11:06 -05002663 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
Alex Elderd184f6b2012-01-29 13:57:44 -06002664 spin_unlock(&rbd_dev_list_lock);
2665 return;
2666 }
2667
2668 /*
2669 * We need to update the current maximum id. Search the
2670 * list to find out what it is. We're more likely to find
2671 * the maximum at the end, so search the list backward.
2672 */
2673 max_id = 0;
2674 list_for_each_prev(tmp, &rbd_dev_list) {
2675 struct rbd_device *rbd_dev;
2676
2677 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderb213e0b2012-10-10 21:19:13 -07002678 if (rbd_dev->dev_id > max_id)
2679 max_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002680 }
Alex Elder499afd52012-02-02 08:13:29 -06002681 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002682
Alex Elder1ddbe942012-01-29 13:57:44 -06002683 /*
Alex Eldere2839302012-08-29 17:11:06 -05002684 * The max id could have been updated by rbd_dev_id_get(), in
Alex Elderd184f6b2012-01-29 13:57:44 -06002685 * which case it now accurately reflects the new maximum.
2686 * Be careful not to overwrite the maximum value in that
2687 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002688 */
Alex Eldere2839302012-08-29 17:11:06 -05002689 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2690 dout(" max dev id has been reset\n");
Alex Elderb7f23c32012-01-29 13:57:43 -06002691}
2692
Alex Eldera725f65e2012-02-02 08:13:30 -06002693/*
Alex Eldere28fff262012-02-02 08:13:30 -06002694 * Skips over white space at *buf, and updates *buf to point to the
2695 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06002696 * the token (string of non-white space characters) found. Note
2697 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06002698 */
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* Find start of token */

	return strcspn(*buf, spaces);	/* Return token length */
}
2711
2712/*
2713 * Finds the next token in *buf, and if the provided token buffer is
2714 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06002715 * copied, is guaranteed to be terminated with '\0'. Note that *buf
2716 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06002717 *
2718 * Returns the length of the token found (not including the '\0').
2719 * Return value will be 0 if no token is found, and it will be >=
2720 * token_size if the token would not fit.
2721 *
Alex Elder593a9e72012-02-07 12:03:37 -06002722 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06002723 * found token. Note that this occurs even if the token buffer is
2724 * too small to hold it.
2725 */
/*
 * Finds the next token in *buf and, when the caller's buffer is big
 * enough, copies it there with a terminating '\0'.  Returns the token
 * length (0 if none found; >= token_size if it would not fit, in
 * which case nothing is copied).  *buf is advanced past the token
 * either way.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2741
2742/*
Alex Elderea3352f2012-07-09 21:04:23 -05002743 * Finds the next token in *buf, dynamically allocates a buffer big
2744 * enough to hold a copy of it, and copies the token into the new
2745 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2746 * that a duplicate buffer is created even for a zero-length token.
2747 *
2748 * Returns a pointer to the newly-allocated duplicate, or a null
2749 * pointer if memory for the duplicate was not available. If
2750 * the lenp argument is a non-null pointer, the length of the token
2751 * (not including the '\0') is returned in *lenp.
2752 *
2753 * If successful, the *buf pointer will be updated to point beyond
2754 * the end of the found token.
2755 *
2756 * Note: uses GFP_KERNEL for allocation.
2757 */
2758static inline char *dup_token(const char **buf, size_t *lenp)
2759{
2760 char *dup;
2761 size_t len;
2762
2763 len = next_token(buf);
2764 dup = kmalloc(len + 1, GFP_KERNEL);
2765 if (!dup)
2766 return NULL;
2767
2768 memcpy(dup, *buf, len);
2769 *(dup + len) = '\0';
2770 *buf += len;
2771
2772 if (lenp)
2773 *lenp = len;
2774
2775 return dup;
2776}
2777
2778/*
 * This fills in the pool_name, image_name, and image_name_len fields
 * of the given rbd_dev, based on the list of monitor addresses and
 * other options provided via /sys/bus/rbd/add.  Returns a pointer to
 * a dynamically-allocated copy of the snapshot name to map if
 * successful, or a pointer-coded error otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05002785 *
2786 * Note: rbd_dev is assumed to have been initially zero-filled.
Alex Eldera725f65e2012-02-02 08:13:30 -06002787 */
static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
				const char *buf,
				const char **mon_addrs,
				size_t *mon_addrs_size,
				char *options,
				size_t options_size)
{
	size_t len;
	char *err_ptr = ERR_PTR(-EINVAL);	/* for missing/bad required tokens */
	char *snap_name;

	/* The first four tokens are required */

	/* Token 1: monitor address list (returned by reference, not copied) */
	len = next_token(&buf);
	if (!len)
		return err_ptr;
	*mon_addrs_size = len + 1;	/* includes room for the '\0' */
	*mon_addrs = buf;

	buf += len;

	/* Token 2: option string, copied into the caller's buffer */
	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)	/* empty or truncated: invalid */
		return err_ptr;

	/* Remaining failures below are allocation failures */
	err_ptr = ERR_PTR(-ENOMEM);
	/* Token 3: pool name (rbd_dev takes ownership of the copy) */
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	/* Token 4: image name (rbd_dev takes ownership of the copy) */
	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Snapshot name is optional; default is to use "head" */

	len = next_token(&buf);
	if (len > RBD_MAX_SNAP_NAME_LEN) {
		err_ptr = ERR_PTR(-ENAMETOOLONG);
		goto out_err;
	}
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	}
	/* Caller owns (and must free) the returned snapshot name */
	snap_name = kmalloc(len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_err;
	memcpy(snap_name, buf, len);
	*(snap_name + len) = '\0';

	return snap_name;

out_err:
	/* Undo any field assignments made above (kfree(NULL) is a no-op) */
	kfree(rbd_dev->image_name);
	rbd_dev->image_name = NULL;
	rbd_dev->image_name_len = 0;
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return err_ptr;
}
2850
Alex Elder589d30e2012-07-10 20:30:11 -05002851/*
2852 * An rbd format 2 image has a unique identifier, distinct from the
2853 * name given to it by the user. Internally, that identifier is
2854 * what's used to specify the names of objects related to the image.
2855 *
2856 * A special "rbd id" object is used to map an rbd image name to its
2857 * id. If that object doesn't exist, then there is no v2 rbd image
2858 * with the supplied name.
2859 *
2860 * This function will record the given rbd_dev's image_id field if
2861 * it can be determined, and in that case will return 0. If any
2862 * errors occur a negative errno will be returned and the rbd_dev's
2863 * image_id field will be unchanged (and should be NULL).
2864 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	/* Object name is "<RBD_ID_PREFIX><image name>" */
	size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len;
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* Invoke the class method that maps image name -> image id */
	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	/* Decode the length-prefixed id string; rbd_dev owns the copy */
	p = response;
	rbd_dev->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						&rbd_dev->image_id_len,
						GFP_NOIO);
	if (IS_ERR(rbd_dev->image_id)) {
		ret = PTR_ERR(rbd_dev->image_id);
		rbd_dev->image_id = NULL;	/* leave field unchanged on error */
	} else {
		dout("image_id is %s\n", rbd_dev->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
2920
/*
 * Probe an image assumed to use the original (format 1) on-disk
 * layout: record an empty image id, compute the header object name,
 * and read the on-disk header into rbd_dev->header.
 */
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->image_id)
		return -ENOMEM;
	rbd_dev->image_id_len = 0;

	/* Record the header object name for this rbd image. */

	/* Format 1 header object is named "<image name><RBD_SUFFIX>" */
	size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;
	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	/* Release anything allocated above; leave fields NULL on failure */
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->image_id);
	rbd_dev->image_id = NULL;

	return ret;
}
2963
/*
 * Probe a format 2 image: compute its header object name from the
 * image id, then gather size/order, object prefix, features, and the
 * snapshot context from the header object.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len;
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	/* Undo the allocations made above; leave fields NULL on failure */
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3025
3026/*
3027 * Probe for the existence of the header object for the given rbd
3028 * device. For format 2 images this includes determining the image
3029 * id.
3030 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret;

	/*
	 * Try to read the image id object first.  A failure (the
	 * format 2 id object doesn't exist) means the image uses the
	 * original format.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	ret = ret ? rbd_dev_v1_probe(rbd_dev) : rbd_dev_v2_probe(rbd_dev);
	if (ret)
		dout("probe failed, returning %d\n", ret);

	return ret;
}
3050
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07003051static ssize_t rbd_add(struct bus_type *bus,
3052 const char *buf,
3053 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003054{
Alex Eldercb8627c2012-07-09 21:04:23 -05003055 char *options;
3056 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06003057 const char *mon_addrs = NULL;
3058 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06003059 struct ceph_osd_client *osdc;
3060 int rc = -ENOMEM;
Alex Elder3feeb8942012-08-31 17:29:52 -05003061 char *snap_name;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003062
3063 if (!try_module_get(THIS_MODULE))
3064 return -ENODEV;
3065
Alex Elder27cc2592012-02-02 08:13:30 -06003066 options = kmalloc(count, GFP_KERNEL);
3067 if (!options)
Alex Elder85ae8922012-07-26 23:37:14 -05003068 goto err_out_mem;
Alex Eldercb8627c2012-07-09 21:04:23 -05003069 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
3070 if (!rbd_dev)
Alex Elder85ae8922012-07-26 23:37:14 -05003071 goto err_out_mem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003072
3073 /* static rbd_device initialization */
3074 spin_lock_init(&rbd_dev->lock);
3075 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003076 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08003077 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003078
Alex Eldera725f65e2012-02-02 08:13:30 -06003079 /* parse add command */
Alex Elder3feeb8942012-08-31 17:29:52 -05003080 snap_name = rbd_add_parse_args(rbd_dev, buf,
3081 &mon_addrs, &mon_addrs_size, options, count);
3082 if (IS_ERR(snap_name)) {
3083 rc = PTR_ERR(snap_name);
Alex Elder85ae8922012-07-26 23:37:14 -05003084 goto err_out_mem;
Alex Elder3feeb8942012-08-31 17:29:52 -05003085 }
Alex Eldera725f65e2012-02-02 08:13:30 -06003086
Alex Elderf8c38922012-08-10 13:12:07 -07003087 rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
3088 if (rc < 0)
Alex Elder85ae8922012-07-26 23:37:14 -05003089 goto err_out_args;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003090
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003091 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06003092 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003093 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
3094 if (rc < 0)
3095 goto err_out_client;
Alex Elder9bb2f332012-07-12 10:46:35 -05003096 rbd_dev->pool_id = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003097
Alex Eldera30b71b2012-07-10 20:30:11 -05003098 rc = rbd_dev_probe(rbd_dev);
3099 if (rc < 0)
Alex Elder589d30e2012-07-10 20:30:11 -05003100 goto err_out_client;
Alex Elder05fd6f62012-08-29 17:11:07 -05003101
3102 /* no need to lock here, as rbd_dev is not registered yet */
3103 rc = rbd_dev_snaps_update(rbd_dev);
3104 if (rc)
3105 goto err_out_header;
3106
3107 rc = rbd_dev_set_mapping(rbd_dev, snap_name);
3108 if (rc)
3109 goto err_out_header;
3110
Alex Elder85ae8922012-07-26 23:37:14 -05003111 /* generate unique id: find highest unique id, add one */
3112 rbd_dev_id_get(rbd_dev);
3113
3114 /* Fill in the device name, now that we have its id. */
3115 BUILD_BUG_ON(DEV_NAME_LEN
3116 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
3117 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
3118
3119 /* Get our block major device number. */
3120
Alex Elder27cc2592012-02-02 08:13:30 -06003121 rc = register_blkdev(0, rbd_dev->name);
3122 if (rc < 0)
Alex Elder85ae8922012-07-26 23:37:14 -05003123 goto err_out_id;
Alex Elder27cc2592012-02-02 08:13:30 -06003124 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003125
Alex Elder0f308a32012-08-29 17:11:07 -05003126 /* Set up the blkdev mapping. */
3127
3128 rc = rbd_init_disk(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003129 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08003130 goto err_out_blkdev;
3131
Alex Elder0f308a32012-08-29 17:11:07 -05003132 rc = rbd_bus_add_dev(rbd_dev);
3133 if (rc)
3134 goto err_out_disk;
3135
Alex Elder32eec682012-02-08 16:11:14 -06003136 /*
3137 * At this point cleanup in the event of an error is the job
3138 * of the sysfs code (initiated by rbd_bus_del_dev()).
Alex Elder32eec682012-02-08 16:11:14 -06003139 */
Alex Elder2ac4e752012-07-10 20:30:10 -05003140
Alex Elder4bb1f1e2012-08-23 23:48:49 -05003141 down_write(&rbd_dev->header_rwsem);
Alex Elder5ed16172012-08-29 17:11:07 -05003142 rc = rbd_dev_snaps_register(rbd_dev);
Alex Elder4bb1f1e2012-08-23 23:48:49 -05003143 up_write(&rbd_dev->header_rwsem);
Alex Elder2ac4e752012-07-10 20:30:10 -05003144 if (rc)
3145 goto err_out_bus;
3146
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07003147 rc = rbd_init_watch_dev(rbd_dev);
3148 if (rc)
3149 goto err_out_bus;
3150
Alex Elder3ee40012012-08-29 17:11:07 -05003151 /* Everything's ready. Announce the disk to the world. */
3152
3153 add_disk(rbd_dev->disk);
3154
3155 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
3156 (unsigned long long) rbd_dev->mapping.size);
3157
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003158 return count;
3159
Yehuda Sadeh766fc432011-01-07 14:58:42 -08003160err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08003161 /* this will also clean up rest of rbd_dev stuff */
3162
3163 rbd_bus_del_dev(rbd_dev);
3164 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08003165 return rc;
3166
Alex Elder0f308a32012-08-29 17:11:07 -05003167err_out_disk:
3168 rbd_free_disk(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003169err_out_blkdev:
3170 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder85ae8922012-07-26 23:37:14 -05003171err_out_id:
3172 rbd_dev_id_put(rbd_dev);
Alex Elder05fd6f62012-08-29 17:11:07 -05003173err_out_header:
3174 rbd_header_free(&rbd_dev->header);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003175err_out_client:
Alex Elder3fcf2582012-07-03 16:01:19 -05003176 kfree(rbd_dev->header_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003177 rbd_put_client(rbd_dev);
Alex Elder589d30e2012-07-10 20:30:11 -05003178 kfree(rbd_dev->image_id);
Alex Elder85ae8922012-07-26 23:37:14 -05003179err_out_args:
3180 kfree(rbd_dev->mapping.snap_name);
3181 kfree(rbd_dev->image_name);
3182 kfree(rbd_dev->pool_name);
3183err_out_mem:
Alex Elder27cc2592012-02-02 08:13:30 -06003184 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05003185 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06003186
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003187 dout("Error adding device %s\n", buf);
3188 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06003189
3190 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003191}
3192
Alex Elderde71a292012-07-03 16:01:19 -05003193static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003194{
3195 struct list_head *tmp;
3196 struct rbd_device *rbd_dev;
3197
Alex Eldere124a82f2012-01-29 13:57:44 -06003198 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003199 list_for_each(tmp, &rbd_dev_list) {
3200 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05003201 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a82f2012-01-29 13:57:44 -06003202 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003203 return rbd_dev;
Alex Eldere124a82f2012-01-29 13:57:44 -06003204 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003205 }
Alex Eldere124a82f2012-01-29 13:57:44 -06003206 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003207 return NULL;
3208}
3209
/*
 * Release callback for an rbd device's embedded struct device.
 * Tears the device down in roughly the reverse order of its setup:
 * watch request/event, client reference, disk and block device,
 * header fields, name strings, device id, and finally the
 * rbd_device itself and the module reference taken at add time.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Unregister the lingering watch request, if one is active. */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	/* Tear down the watch event registration with the OSDs. */
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	/* Drop our reference on the ceph client. */
	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->mapping.snap_name);
	kfree(rbd_dev->image_id);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_dev_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
3244
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003245static ssize_t rbd_remove(struct bus_type *bus,
3246 const char *buf,
3247 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003248{
3249 struct rbd_device *rbd_dev = NULL;
3250 int target_id, rc;
3251 unsigned long ul;
3252 int ret = count;
3253
3254 rc = strict_strtoul(buf, 10, &ul);
3255 if (rc)
3256 return rc;
3257
3258 /* convert to int; abort if we lost anything in the conversion */
3259 target_id = (int) ul;
3260 if (target_id != ul)
3261 return -EINVAL;
3262
3263 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3264
3265 rbd_dev = __rbd_get_dev(target_id);
3266 if (!rbd_dev) {
3267 ret = -ENOENT;
3268 goto done;
3269 }
3270
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003271 __rbd_remove_all_snaps(rbd_dev);
3272 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003273
3274done:
3275 mutex_unlock(&ctl_mutex);
Alex Elderaafb2302012-09-06 16:00:54 -05003276
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003277 return ret;
3278}
3279
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003280/*
3281 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003282 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003283 */
3284static int rbd_sysfs_init(void)
3285{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003286 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003287
Alex Elderfed4c142012-02-07 12:03:36 -06003288 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06003289 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003290 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003291
Alex Elderfed4c142012-02-07 12:03:36 -06003292 ret = bus_register(&rbd_bus_type);
3293 if (ret < 0)
3294 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003295
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003296 return ret;
3297}
3298
/*
 * Remove the sysfs control files: unregister the bus first, then the
 * root device it hangs off of (reverse order of rbd_sysfs_init()).
 */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3304
3305int __init rbd_init(void)
3306{
3307 int rc;
3308
3309 rc = rbd_sysfs_init();
3310 if (rc)
3311 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06003312 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003313 return 0;
3314}
3315
/* Module exit: tear down the sysfs bus and root device. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3320
3321module_init(rbd_init);
3322module_exit(rbd_exit);
3323
3324MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
3325MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
3326MODULE_DESCRIPTION("rados block device");
3327
3328/* following authorship retained from original osdblk.c */
3329MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
3330
3331MODULE_LICENSE("GPL");