/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

                 Documentation/ABI/testing/sysfs-bus-rbd

 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define SECTOR_SHIFT	9
#define SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* It might be useful to have this defined elsewhere too */

#define U64_MAX	((u64) (~0ULL))

/* Driver name as registered with the block layer and shown in logs */
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

/*
 * Snapshot sysfs entries are named "snap_<name>"; the prefix eats
 * into the NAME_MAX budget available for the snapshot name itself.
 */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN \
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
#define RBD_MAX_OPT_LEN		1024

/* Mapped-snapshot name used when mapping the image head (no snapshot) */
#define RBD_SNAP_HEAD_NAME	"-"

#define RBD_IMAGE_ID_LEN_MAX	64
#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */

#define RBD_FEATURE_LAYERING	1

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_ALL	(0)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

/* Default for the "read_only"/"read_write" mount option */
#define RBD_READ_ONLY_DEFAULT	false
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070094
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These four fields never change for a given rbd image */
	char *object_prefix;	/* prefix of backing data object names */
	u64 features;		/* feature bits (always 0 for v1 images) */
	__u8 obj_order;		/* object size is (1 << obj_order) bytes */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* image size in bytes */
	struct ceph_snap_context *snapc;
	char *snap_names;	/* buffer holding all snapshot names */
	u64 *snap_sizes;	/* per-snapshot image sizes */

	u64 obj_version;
};

/* User-settable options parsed from the sysfs "add" string */
struct rbd_options {
	bool read_only;
};
118
/*
 * an instance of the client. multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client *client;
	struct kref kref;		/* shared-client reference count */
	struct list_head node;		/* entry in rbd_client_list */
};

/*
 * a request completion status
 */
struct rbd_req_status {
	int done;
	int rc;
	u64 bytes;
};

/*
 * a collection of requests
 */
struct rbd_req_coll {
	int total;
	int num_done;
	struct kref kref;
	struct rbd_req_status status[0];	/* one status per request */
};

/*
 * a single io request
 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;
	int coll_index;			/* index into coll->status[] */
	struct rbd_req_coll *coll;
};

/* In-memory state for one snapshot, including its sysfs device node */
struct rbd_snap {
	struct device dev;
	const char *name;
	u64 size;
	struct list_head node;		/* entry in rbd_device->snaps */
	u64 id;
	u64 features;
};

/* Parameters of whatever (image head or snapshot) is currently mapped */
struct rbd_mapping {
	u64 size;
	u64 features;
	bool read_only;
};
173
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700174/*
175 * a single device
176 */
177struct rbd_device {
Alex Elderde71a292012-07-03 16:01:19 -0500178 int dev_id; /* blkdev unique id */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700179
180 int major; /* blkdev assigned major */
181 struct gendisk *disk; /* blkdev's gendisk and rq */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700182
Alex Eldera30b71b2012-07-10 20:30:11 -0500183 u32 image_format; /* Either 1 or 2 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700184 struct rbd_client *rbd_client;
185
186 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
187
188 spinlock_t lock; /* queue lock */
189
190 struct rbd_image_header header;
Alex Elderdaba5fd2012-10-26 17:25:23 -0500191 bool exists;
Alex Elder589d30e2012-07-10 20:30:11 -0500192 char *image_id;
193 size_t image_id_len;
Alex Elder0bed54d2012-07-03 16:01:18 -0500194 char *image_name;
195 size_t image_name_len;
196 char *header_name;
Alex Elderd22f76e2012-07-12 10:46:35 -0500197 char *pool_name;
Alex Elder86992092012-10-25 23:34:41 -0500198 u64 pool_id;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700199
Alex Elder971f8392012-10-25 23:34:41 -0500200 char *snap_name;
201 u64 snap_id;
202
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700203 struct ceph_osd_event *watch_event;
204 struct ceph_osd_request *watch_request;
205
Josh Durginc6666012011-11-21 17:11:12 -0800206 /* protects updating the header */
207 struct rw_semaphore header_rwsem;
Alex Elderf84344f2012-08-31 17:29:51 -0500208
209 struct rbd_mapping mapping;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700210
211 struct list_head node;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800212
213 /* list of snapshots */
214 struct list_head snaps;
215
216 /* sysfs related */
217 struct device dev;
218};
219
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);

static void rbd_dev_release(struct device *dev);
static void rbd_remove_snap_dev(struct rbd_snap *snap);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);

/* Devices are added/removed by writing to /sys/bus/rbd/{add,remove} */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/* rbd_root_dev is static, so there is nothing to free on release */
static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};
258
#ifdef RBD_DEBUG
/*
 * Report an assertion failure verbosely, then BUG().
 *
 * Wrapped in do { } while (0) so the macro expands to a single
 * statement; the previous bare-if form misbehaved when used as the
 * body of an unbraced if/else (dangling-else hazard).
 */
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800271
/* Take a reference on the rbd device's embedded sysfs device */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}

/* Drop the reference taken by rbd_get_dev() */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700281
Alex Elder117973f2012-08-31 17:29:55 -0500282static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
283static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700284
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700285static int rbd_open(struct block_device *bdev, fmode_t mode)
286{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600287 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700288
Alex Elderf84344f2012-08-31 17:29:51 -0500289 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700290 return -EROFS;
291
Alex Elder340c7a22012-08-10 13:12:07 -0700292 rbd_get_dev(rbd_dev);
Alex Elderf84344f2012-08-31 17:29:51 -0500293 set_device_ro(bdev, rbd_dev->mapping.read_only);
Alex Elder340c7a22012-08-10 13:12:07 -0700294
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700295 return 0;
296}
297
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800298static int rbd_release(struct gendisk *disk, fmode_t mode)
299{
300 struct rbd_device *rbd_dev = disk->private_data;
301
302 rbd_put_dev(rbd_dev);
303
304 return 0;
305}
306
/* Block-layer entry points for rbd devices */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
312
313/*
314 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500315 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700316 */
Alex Elderf8c38922012-08-10 13:12:07 -0700317static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700318{
319 struct rbd_client *rbdc;
320 int ret = -ENOMEM;
321
322 dout("rbd_client_create\n");
323 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
324 if (!rbdc)
325 goto out_opt;
326
327 kref_init(&rbdc->kref);
328 INIT_LIST_HEAD(&rbdc->node);
329
Alex Elderbc534d82012-01-29 13:57:44 -0600330 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
331
Alex Elder43ae4702012-07-03 16:01:18 -0500332 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700333 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600334 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500335 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700336
337 ret = ceph_open_session(rbdc->client);
338 if (ret < 0)
339 goto out_err;
340
Alex Elder432b8582012-01-29 13:57:44 -0600341 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700342 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600343 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700344
Alex Elderbc534d82012-01-29 13:57:44 -0600345 mutex_unlock(&ctl_mutex);
346
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700347 dout("rbd_client_create created %p\n", rbdc);
348 return rbdc;
349
350out_err:
351 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600352out_mutex:
353 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700354 kfree(rbdc);
355out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500356 if (ceph_opts)
357 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400358 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700359}
360
361/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700362 * Find a ceph client with specific addr and configuration. If
363 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700364 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700365static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700366{
367 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700368 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700369
Alex Elder43ae4702012-07-03 16:01:18 -0500370 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700371 return NULL;
372
Alex Elder1f7ba332012-08-10 13:12:07 -0700373 spin_lock(&rbd_client_list_lock);
374 list_for_each_entry(client_node, &rbd_client_list, node) {
375 if (!ceph_compare_options(ceph_opts, client_node->client)) {
376 kref_get(&client_node->kref);
377 found = true;
378 break;
379 }
380 }
381 spin_unlock(&rbd_client_list_lock);
382
383 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700384}
385
/*
 * mount options
 *
 * The Opt_last_* enumerators are sentinels separating token classes
 * (int-valued, string-valued, Boolean); parse_rbd_opts_token() uses
 * them to decide how to extract each token's argument.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};
410
/*
 * Parse one option token into the rbd_options that @private points at.
 * (Presumably invoked once per comma-separated token by the options
 * parser -- the caller is not visible in this file; confirm.)
 *
 * Returns 0 on success, -EINVAL for an unrecognized token, or the
 * match_int() error for a malformed integer argument.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* The sentinel ordering in the enum tells us the token's class */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* Every token in rbd_opts_tokens must be handled above */
		rbd_assert(false);
		break;
	}
	return 0;
}
451
452/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700453 * Get a ceph client with specific addr and configuration, if one does
454 * not exist create it.
455 */
Alex Elder78cea762012-10-25 23:34:41 -0500456static int rbd_get_client(struct rbd_device *rbd_dev,
457 struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700458{
Alex Elderf8c38922012-08-10 13:12:07 -0700459 struct rbd_client *rbdc;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700460
Alex Elder1f7ba332012-08-10 13:12:07 -0700461 rbdc = rbd_client_find(ceph_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700462 if (rbdc) {
Alex Eldere6994d32012-01-29 13:57:44 -0600463 /* using an existing client */
Alex Elder43ae4702012-07-03 16:01:18 -0500464 ceph_destroy_options(ceph_opts);
Alex Elderf8c38922012-08-10 13:12:07 -0700465 } else {
466 rbdc = rbd_client_create(ceph_opts);
467 if (IS_ERR(rbdc))
468 return PTR_ERR(rbdc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700469 }
Alex Elderf8c38922012-08-10 13:12:07 -0700470 rbd_dev->rbd_client = rbdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700471
Alex Elderf8c38922012-08-10 13:12:07 -0700472 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700473}
474
/*
 * Destroy ceph client
 *
 * Caller must hold rbd_client_list_lock.
 *
 * (kref release callback -- runs when the last reference is dropped.
 * NOTE(review): despite the comment above, this takes
 * rbd_client_list_lock itself; confirm callers do NOT hold it.)
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	/* Unlink from the shared-client list before tearing down */
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
492
493/*
494 * Drop reference to ceph client node. If it's not referenced anymore, release
495 * it.
496 */
497static void rbd_put_client(struct rbd_device *rbd_dev)
498{
499 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
500 rbd_dev->rbd_client = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700501}
502
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700503/*
504 * Destroy requests collection
505 */
506static void rbd_coll_release(struct kref *kref)
507{
508 struct rbd_req_coll *coll =
509 container_of(kref, struct rbd_req_coll, kref);
510
511 dout("rbd_coll_release %p\n", coll);
512 kfree(coll);
513}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700514
Alex Eldera30b71b2012-07-10 20:30:11 -0500515static bool rbd_image_format_valid(u32 image_format)
516{
517 return image_format == 1 || image_format == 2;
518}
519
Alex Elder8e94af82012-07-25 09:32:40 -0500520static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
521{
Alex Elder103a1502012-08-02 11:29:45 -0500522 size_t size;
523 u32 snap_count;
524
525 /* The header has to start with the magic rbd header text */
526 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
527 return false;
528
Alex Elderdb2388b2012-10-20 22:17:27 -0500529 /* The bio layer requires at least sector-sized I/O */
530
531 if (ondisk->options.order < SECTOR_SHIFT)
532 return false;
533
534 /* If we use u64 in a few spots we may be able to loosen this */
535
536 if (ondisk->options.order > 8 * sizeof (int) - 1)
537 return false;
538
Alex Elder103a1502012-08-02 11:29:45 -0500539 /*
540 * The size of a snapshot header has to fit in a size_t, and
541 * that limits the number of snapshots.
542 */
543 snap_count = le32_to_cpu(ondisk->snap_count);
544 size = SIZE_MAX - sizeof (struct ceph_snap_context);
545 if (snap_count > size / sizeof (__le64))
546 return false;
547
548 /*
549 * Not only that, but the size of the entire the snapshot
550 * header must also be representable in a size_t.
551 */
552 size -= snap_count * sizeof (__le64);
553 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
554 return false;
555
556 return true;
Alex Elder8e94af82012-07-25 09:32:40 -0500557}
558
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700559/*
560 * Create a new header structure, translate header format from the on-disk
561 * header.
562 */
563static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500564 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700565{
Alex Elderccece232012-07-10 20:30:10 -0500566 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500567 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500568 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500569 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700570
Alex Elder6a523252012-07-19 17:12:59 -0500571 memset(header, 0, sizeof (*header));
572
Alex Elder103a1502012-08-02 11:29:45 -0500573 snap_count = le32_to_cpu(ondisk->snap_count);
574
Alex Elder58c17b02012-08-23 23:22:06 -0500575 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
576 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500577 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700578 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500579 memcpy(header->object_prefix, ondisk->object_prefix, len);
580 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600581
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700582 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500583 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
584
Alex Elder621901d2012-08-23 23:22:06 -0500585 /* Save a copy of the snapshot names */
586
Alex Elderf785cc12012-08-23 23:22:06 -0500587 if (snap_names_len > (u64) SIZE_MAX)
588 return -EIO;
589 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700590 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500591 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500592 /*
593 * Note that rbd_dev_v1_header_read() guarantees
594 * the ondisk buffer we're working with has
595 * snap_names_len bytes beyond the end of the
596 * snapshot id array, this memcpy() is safe.
597 */
598 memcpy(header->snap_names, &ondisk->snaps[snap_count],
599 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500600
Alex Elder621901d2012-08-23 23:22:06 -0500601 /* Record each snapshot's size */
602
Alex Elderd2bb24e2012-07-26 23:37:14 -0500603 size = snap_count * sizeof (*header->snap_sizes);
604 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700605 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500606 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500607 for (i = 0; i < snap_count; i++)
608 header->snap_sizes[i] =
609 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700610 } else {
Alex Elderccece232012-07-10 20:30:10 -0500611 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700612 header->snap_names = NULL;
613 header->snap_sizes = NULL;
614 }
Alex Elder849b4262012-07-09 21:04:24 -0500615
Alex Elder34b13182012-07-13 20:35:12 -0500616 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700617 header->obj_order = ondisk->options.order;
618 header->crypt_type = ondisk->options.crypt_type;
619 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500620
Alex Elder621901d2012-08-23 23:22:06 -0500621 /* Allocate and fill in the snapshot context */
622
Alex Elderf84344f2012-08-31 17:29:51 -0500623 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500624 size = sizeof (struct ceph_snap_context);
625 size += snap_count * sizeof (header->snapc->snaps[0]);
626 header->snapc = kzalloc(size, GFP_KERNEL);
627 if (!header->snapc)
628 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700629
630 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500631 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700632 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500633 for (i = 0; i < snap_count; i++)
634 header->snapc->snaps[i] =
635 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700636
637 return 0;
638
Alex Elder6a523252012-07-19 17:12:59 -0500639out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500640 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500641 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700642 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500643 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500644 kfree(header->object_prefix);
645 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500646
Alex Elder00f1f362012-02-07 12:03:36 -0600647 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700648}
649
Alex Elder8836b992012-08-30 14:42:15 -0500650static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700651{
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700652
Alex Eldere86924a2012-07-10 20:30:11 -0500653 struct rbd_snap *snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600654
Alex Eldere86924a2012-07-10 20:30:11 -0500655 list_for_each_entry(snap, &rbd_dev->snaps, node) {
656 if (!strcmp(snap_name, snap->name)) {
Alex Elder971f8392012-10-25 23:34:41 -0500657 rbd_dev->snap_id = snap->id;
Alex Eldere86924a2012-07-10 20:30:11 -0500658 rbd_dev->mapping.size = snap->size;
Alex Elder34b13182012-07-13 20:35:12 -0500659 rbd_dev->mapping.features = snap->features;
Alex Elder00f1f362012-02-07 12:03:36 -0600660
Alex Eldere86924a2012-07-10 20:30:11 -0500661 return 0;
Alex Elder00f1f362012-02-07 12:03:36 -0600662 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700663 }
Alex Eldere86924a2012-07-10 20:30:11 -0500664
Alex Elder00f1f362012-02-07 12:03:36 -0600665 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700666}
667
Alex Elder819d52b2012-10-25 23:34:41 -0500668static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700669{
Alex Elder78dc4472012-07-19 08:49:18 -0500670 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700671
Alex Elder819d52b2012-10-25 23:34:41 -0500672 if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
Josh Durgincc9d7342011-11-21 18:19:13 -0800673 sizeof (RBD_SNAP_HEAD_NAME))) {
Alex Elder971f8392012-10-25 23:34:41 -0500674 rbd_dev->snap_id = CEPH_NOSNAP;
Alex Elder99c1f082012-08-30 14:42:15 -0500675 rbd_dev->mapping.size = rbd_dev->header.image_size;
Alex Elder34b13182012-07-13 20:35:12 -0500676 rbd_dev->mapping.features = rbd_dev->header.features;
Alex Eldere86924a2012-07-10 20:30:11 -0500677 ret = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700678 } else {
Alex Elder819d52b2012-10-25 23:34:41 -0500679 ret = snap_by_name(rbd_dev, rbd_dev->snap_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700680 if (ret < 0)
681 goto done;
Alex Elderf84344f2012-08-31 17:29:51 -0500682 rbd_dev->mapping.read_only = true;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700683 }
Alex Elderdaba5fd2012-10-26 17:25:23 -0500684 rbd_dev->exists = true;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700685done:
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700686 return ret;
687}
688
/*
 * Release everything rbd_header_from_disk() allocated, NULLing each
 * pointer so a repeated call is harmless.
 */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	ceph_put_snap_context(header->snapc);	/* drops a reference */
	header->snapc = NULL;
}
700
Alex Elder65ccfe22012-08-09 10:33:26 -0700701static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700702{
Alex Elder65ccfe22012-08-09 10:33:26 -0700703 char *name;
704 u64 segment;
705 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700706
Alex Elder65ccfe22012-08-09 10:33:26 -0700707 name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
708 if (!name)
709 return NULL;
710 segment = offset >> rbd_dev->header.obj_order;
711 ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
712 rbd_dev->header.object_prefix, segment);
713 if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
714 pr_err("error formatting segment name for #%llu (%d)\n",
715 segment, ret);
716 kfree(name);
717 name = NULL;
718 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700719
Alex Elder65ccfe22012-08-09 10:33:26 -0700720 return name;
721}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700722
Alex Elder65ccfe22012-08-09 10:33:26 -0700723static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
724{
725 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700726
Alex Elder65ccfe22012-08-09 10:33:26 -0700727 return offset & (segment_size - 1);
728}
729
730static u64 rbd_segment_length(struct rbd_device *rbd_dev,
731 u64 offset, u64 length)
732{
733 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
734
735 offset &= segment_size - 1;
736
Alex Elderaafb230e2012-09-06 16:00:54 -0500737 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700738 if (offset + length > segment_size)
739 length = segment_size - offset;
740
741 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700742}
743
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700744static int rbd_get_num_segments(struct rbd_image_header *header,
745 u64 ofs, u64 len)
746{
Alex Elderdf111be2012-08-09 10:33:26 -0700747 u64 start_seg;
748 u64 end_seg;
749
750 if (!len)
751 return 0;
752 if (len - 1 > U64_MAX - ofs)
753 return -ERANGE;
754
755 start_seg = ofs >> header->obj_order;
756 end_seg = (ofs + len - 1) >> header->obj_order;
757
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700758 return end_seg - start_seg + 1;
759}
760
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700761/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700762 * returns the size of an object in the image
763 */
764static u64 rbd_obj_bytes(struct rbd_image_header *header)
765{
766 return 1 << header->obj_order;
767}
768
769/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700770 * bio helpers
771 */
772
773static void bio_chain_put(struct bio *chain)
774{
775 struct bio *tmp;
776
777 while (chain) {
778 tmp = chain;
779 chain = chain->bi_next;
780 bio_put(tmp);
781 }
782}
783
/*
 * zeros a bio chain, starting at specific offset
 *
 * Walks every bio_vec of every bio in the chain, tracking the
 * running byte position in "pos", and zeroes all payload bytes at
 * or beyond start_ofs.  A vector straddling start_ofs is zeroed
 * only from "remainder" (the offset of start_ofs within it) on.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* byte offset of current bio_vec within the chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* zero only the part past start_ofs */
				int remainder = max(start_ofs - pos, 0);
				/* map the page with IRQs off for the memset */
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
810
/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 *
 * Returns a new bio covering exactly [offset, offset + len) of
 * bio_src's data, or NULL on error (bad arguments or allocation
 * failure).  The clone shares bio_src's pages; only the bio_vec
 * entries are copied and trimmed.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;
	unsigned short end_idx;
	unsigned short vcnt;
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	/* Reject empty, oversized, or out-of-range requests */
	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;	/* byte offset of the clone within segment idx */

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	/* resid is now the number of bytes used in the last segment */
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		/* first and last segment coincide; just use len */
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700891
/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;	/* where to link the next clone */

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi)
			goto out_err;	/* EINVAL; ran out of bio's */
		/* clone at most to the end of the current source bio */
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {
			/* consumed this source bio; advance to the next */
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	/* report where the next un-cloned byte lives */
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	/* drop any clones built so far */
	bio_chain_put(chain);

	return NULL;
}
952
953/*
954 * helpers for osd request op vectors.
955 */
Alex Elder57cfc102012-06-26 12:57:03 -0700956static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
957 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700958{
Alex Elder57cfc102012-06-26 12:57:03 -0700959 struct ceph_osd_req_op *ops;
960
961 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
962 if (!ops)
963 return NULL;
964
965 ops[0].op = opcode;
966
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700967 /*
968 * op extent offset and length will be set later on
969 * in calc_raw_layout()
970 */
Alex Elder57cfc102012-06-26 12:57:03 -0700971 ops[0].payload_len = payload_len;
972
973 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700974}
975
/* Free an op vector allocated by rbd_create_rw_ops() */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
980
/*
 * Record completion of slot @index of a request collection with the
 * given result and byte count, then finish (in index order) every
 * leading run of completed slots against the block request.  With no
 * collection the block request is finished directly.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	if (!coll) {
		/* no collection: complete the whole extent at once */
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	/* status[] and num_done are protected by the queue lock */
	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	max = min = coll->num_done;
	/* find how far the prefix of consecutively-done slots extends */
	while (max < coll->total && coll->status[max].done)
		max++;

	/* completions must be delivered in order, so only that prefix */
	for (i = min; i<max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		/* drop the reference taken for this slot's request */
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
1018
/* Complete the collection slot associated with a single rbd_request */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
1024
/*
 * Send ceph osd request
 *
 * Allocates, builds, and submits one OSD request for @object_name.
 * Data is carried either by @bio or by @pages/@num_pages.  If
 * @rbd_cb is non-NULL the request completes asynchronously through
 * that callback; otherwise this waits for completion, optionally
 * reports the reassert version via @ver, and puts the request.
 * If @linger_req is non-NULL the request is registered as lingering
 * and returned through it.  On submission failure any collection
 * slot (@coll/@coll_index) is completed with the error.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *ops,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *req,
					 struct ceph_msg *msg),
			  struct ceph_osd_request **linger_req,
			  u64 *ver)
{
	struct ceph_osd_request *req;
	struct ceph_file_layout *layout;
	int ret;
	u64 bno;
	struct timespec mtime = CURRENT_TIME;
	struct rbd_request *req_data;
	struct ceph_osd_request_head *reqhead;
	struct ceph_osd_client *osdc;

	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
	if (!req_data) {
		/* still complete the collection slot on failure */
		if (coll)
			rbd_coll_end_req_index(rq, coll, coll_index,
					       -ENOMEM, len);
		return -ENOMEM;
	}

	if (coll) {
		req_data->coll = coll;
		req_data->coll_index = coll_index;
	}

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
		object_name, (unsigned long long) ofs,
		(unsigned long long) len, coll, coll_index);

	osdc = &rbd_dev->rbd_client->client->osdc;
	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
					false, GFP_NOIO, pages, bio);
	if (!req) {
		ret = -ENOMEM;
		goto done_pages;
	}

	req->r_callback = rbd_cb;

	/* stash our context so the completion callback can find it */
	req_data->rq = rq;
	req_data->bio = bio;
	req_data->pages = pages;
	req_data->len = len;

	req->r_priv = req_data;

	reqhead = req->r_request->front.iov_base;
	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);

	/*
	 * NOTE(review): strncpy() leaves r_oid unterminated if
	 * object_name exactly fills it, and strlen() below would then
	 * overrun.  Presumably object names are bounded well below
	 * sizeof(r_oid) -- confirm against RBD_MAX_SEG_NAME_LEN.
	 */
	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
	req->r_oid_len = strlen(req->r_oid);

	/* single-object layout: one stripe, object-sized stripe unit */
	layout = &req->r_file_layout;
	memset(layout, 0, sizeof(*layout));
	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_stripe_count = cpu_to_le32(1);
	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->pool_id);
	ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
				   req, ops);
	rbd_assert(ret == 0);

	ceph_osdc_build_request(req, ofs, &len,
				ops,
				snapc,
				&mtime,
				req->r_oid, req->r_oid_len);

	if (linger_req) {
		ceph_osdc_set_request_linger(osdc, req);
		*linger_req = req;
	}

	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* synchronous: wait here and drop the request ourselves */
		ret = ceph_osdc_wait_request(osdc, req);
		if (ver)
			*ver = le64_to_cpu(req->r_reassert_version.version);
		dout("reassert_ver=%llu\n",
			(unsigned long long)
				le64_to_cpu(req->r_reassert_version.version));
		ceph_osdc_put_request(req);
	}
	return ret;

done_err:
	bio_chain_put(req_data->bio);
	ceph_osdc_put_request(req);
done_pages:
	rbd_coll_end_req(req_data, ret, len);
	kfree(req_data);
	return ret;
}
1138
/*
 * Ceph osd op callback
 *
 * Completion handler for asynchronous requests submitted by
 * rbd_do_request().  Parses the reply, normalizes short/ENOENT
 * reads by zero-filling the tail of the bio chain, completes the
 * collection slot, and releases the request and its context.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
		(unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* object doesn't exist: a read returns all zeroes */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero-fill the remainder */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1178
/* Minimal completion callback: just drop the request reference */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1183
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a page vector sized for @ofs/@inbound_size, issues the
 * request via rbd_do_request() with no callback (so it waits for
 * completion), and for read-flagged operations copies up to the
 * returned byte count from the pages into @inbound.  Returns the
 * operation's result (bytes copied for reads) or a negative errno.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int flags,
			   struct ceph_osd_req_op *ops,
			   const char *object_name,
			   u64 ofs, u64 inbound_size,
			   char *inbound,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(ops != NULL);

	num_pages = calc_pages_for(ofs, inbound_size);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* NULL callback => rbd_do_request() waits for completion */
	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			     object_name, ofs, inbound_size, NULL,
			     pages, num_pages,
			     flags,
			     ops,
			     NULL, 0,
			     NULL,
			     linger_req, ver);
	if (ret < 0)
		goto done;

	/* on success, ret is the number of bytes the osd returned */
	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1227
/*
 * Do an asynchronous ceph osd operation
 *
 * Translates one segment-aligned piece of a block request into an
 * OSD read or write on the backing object and submits it with
 * rbd_req_cb() as the completion callback.  The caller must have
 * already split the I/O on segment boundaries (asserted below).
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;
	int opcode;
	int flags;
	u64 snapid;

	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	if (rq_data_dir(rq) == WRITE) {
		opcode = CEPH_OSD_OP_WRITE;
		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
		snapid = CEPH_NOSNAP;
		payload_len = seg_len;
	} else {
		opcode = CEPH_OSD_OP_READ;
		flags = CEPH_OSD_FLAG_READ;
		/* reads go to the mapped snapshot, with no snap context */
		snapc = NULL;
		snapid = rbd_dev->snap_id;
		payload_len = 0;
	}

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1292
1293/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001294 * Request sync osd read
1295 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001296static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001297 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001298 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001299 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001300 char *buf,
1301 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001302{
Alex Elder913d2fd2012-06-26 12:57:03 -07001303 struct ceph_osd_req_op *ops;
1304 int ret;
1305
1306 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1307 if (!ops)
1308 return -ENOMEM;
1309
1310 ret = rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001311 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001312 CEPH_OSD_FLAG_READ,
Alex Elder913d2fd2012-06-26 12:57:03 -07001313 ops, object_name, ofs, len, buf, NULL, ver);
1314 rbd_destroy_ops(ops);
1315
1316 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001317}
1318
/*
 * Acknowledge a watch notification
 *
 * (The original comment said "Request sync osd watch", which
 * describes rbd_req_sync_watch() below, not this function.)
 * Sends a NOTIFY_ACK for @notify_id on the image's header object;
 * completion is handled asynchronously by rbd_simple_req_cb().
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = cpu_to_le64(ver);
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  rbd_dev->header_name, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  ops,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1348
/*
 * Watch event callback: the header object changed (e.g. a snapshot
 * was taken), so refresh the device's view of the header and then
 * acknowledge the notification.
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	u64 hver;
	int rc;

	if (!rbd_dev)
		return;

	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
		rbd_dev->header_name, (unsigned long long) notify_id,
		(unsigned int) opcode);
	rc = rbd_dev_refresh(rbd_dev, &hver);
	if (rc)
		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
			   " update snaps: %d\n", rbd_dev->major, rc);

	/* ack even if the refresh failed, so notifies keep flowing */
	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
}
1368
/*
 * Request sync osd watch
 *
 * Registers a watch on the image's header object so rbd_watch_cb()
 * is invoked on header changes.  Creates the osd event first, then
 * establishes the watch as a lingering request stored in
 * rbd_dev->watch_request.  On failure the event is cancelled.
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 == establish the watch */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1412
/*
 * Request sync osd unwatch
 *
 * Tears down the watch established by rbd_req_sync_watch(): sends a
 * WATCH op with flag 0 to unregister, then cancels the osd event.
 * The event is cancelled even if the unwatch request failed.
 */
static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 0;	/* 0 == remove the watch */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL, NULL, NULL);


	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	return ret;
}
1442
/*
 * Synchronous osd object method call
 *
 * Invokes @class_name.@method_name on @object_name via a CALL op,
 * sending @outbound/@outbound_size as the method's input and, for
 * read-flagged calls, receiving up to @inbound_size bytes of reply
 * into @inbound.  Returns the osd result or a negative errno.
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     int flags,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int class_name_len = strlen(class_name);
	int method_name_len = strlen(method_name);
	int payload_size;
	int ret;

	/*
	 * Any input parameters required by the method we're calling
	 * will be sent along with the class and method names as
	 * part of the message payload.  That data and its size are
	 * supplied via the indata and indata_len fields (named from
	 * the perspective of the server side) in the OSD request
	 * operation.
	 */
	payload_size = class_name_len + method_name_len + outbound_size;
	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
	if (!ops)
		return -ENOMEM;

	ops[0].cls.class_name = class_name;
	ops[0].cls.class_len = (__u8) class_name_len;
	ops[0].cls.method_name = method_name;
	ops[0].cls.method_len = (__u8) method_name_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = outbound;
	ops[0].cls.indata_len = outbound_size;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      flags, ops,
			      object_name, 0, inbound_size, inbound,
			      NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1495
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001496static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1497{
1498 struct rbd_req_coll *coll =
1499 kzalloc(sizeof(struct rbd_req_coll) +
1500 sizeof(struct rbd_req_status) * num_reqs,
1501 GFP_ATOMIC);
1502
1503 if (!coll)
1504 return NULL;
1505 coll->total = num_reqs;
1506 kref_init(&coll->kref);
1507 return coll;
1508}
1509
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001510/*
1511 * block device queue callback
1512 */
1513static void rbd_rq_fn(struct request_queue *q)
1514{
1515 struct rbd_device *rbd_dev = q->queuedata;
1516 struct request *rq;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001517
Alex Elder00f1f362012-02-07 12:03:36 -06001518 while ((rq = blk_fetch_request(q))) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001519 struct bio *bio;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001520 bool do_write;
Alex Elderbd919d42012-07-13 20:35:11 -05001521 unsigned int size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001522 u64 ofs;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001523 int num_segs, cur_seg = 0;
1524 struct rbd_req_coll *coll;
Josh Durgind1d25642011-12-05 14:03:05 -08001525 struct ceph_snap_context *snapc;
Alex Elderf7760da2012-10-20 22:17:27 -05001526 unsigned int bio_offset;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001527
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001528 dout("fetched request\n");
1529
1530 /* filter out block requests we don't understand */
1531 if ((rq->cmd_type != REQ_TYPE_FS)) {
1532 __blk_end_request_all(rq, 0);
Alex Elder00f1f362012-02-07 12:03:36 -06001533 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001534 }
1535
1536 /* deduce our operation (read, write) */
1537 do_write = (rq_data_dir(rq) == WRITE);
Alex Elderf84344f2012-08-31 17:29:51 -05001538 if (do_write && rbd_dev->mapping.read_only) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001539 __blk_end_request_all(rq, -EROFS);
Alex Elder00f1f362012-02-07 12:03:36 -06001540 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001541 }
1542
1543 spin_unlock_irq(q->queue_lock);
1544
Josh Durgind1d25642011-12-05 14:03:05 -08001545 down_read(&rbd_dev->header_rwsem);
Josh Durgine88a36e2011-11-21 18:14:25 -08001546
Alex Elderdaba5fd2012-10-26 17:25:23 -05001547 if (!rbd_dev->exists) {
1548 rbd_assert(rbd_dev->snap_id != CEPH_NOSNAP);
Josh Durgine88a36e2011-11-21 18:14:25 -08001549 up_read(&rbd_dev->header_rwsem);
Josh Durgind1d25642011-12-05 14:03:05 -08001550 dout("request for non-existent snapshot");
1551 spin_lock_irq(q->queue_lock);
1552 __blk_end_request_all(rq, -ENXIO);
1553 continue;
Josh Durgine88a36e2011-11-21 18:14:25 -08001554 }
1555
Josh Durgind1d25642011-12-05 14:03:05 -08001556 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1557
1558 up_read(&rbd_dev->header_rwsem);
1559
Alex Elderf7760da2012-10-20 22:17:27 -05001560 size = blk_rq_bytes(rq);
1561 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
1562 bio = rq->bio;
1563
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001564 dout("%s 0x%x bytes at 0x%llx\n",
1565 do_write ? "write" : "read",
Alex Elderbd919d42012-07-13 20:35:11 -05001566 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001567
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001568 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
Alex Elderdf111be2012-08-09 10:33:26 -07001569 if (num_segs <= 0) {
1570 spin_lock_irq(q->queue_lock);
1571 __blk_end_request_all(rq, num_segs);
1572 ceph_put_snap_context(snapc);
1573 continue;
1574 }
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001575 coll = rbd_alloc_coll(num_segs);
1576 if (!coll) {
1577 spin_lock_irq(q->queue_lock);
1578 __blk_end_request_all(rq, -ENOMEM);
Josh Durgind1d25642011-12-05 14:03:05 -08001579 ceph_put_snap_context(snapc);
Alex Elder00f1f362012-02-07 12:03:36 -06001580 continue;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001581 }
1582
Alex Elderf7760da2012-10-20 22:17:27 -05001583 bio_offset = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001584 do {
Alex Elderf7760da2012-10-20 22:17:27 -05001585 u64 limit = rbd_segment_length(rbd_dev, ofs, size);
1586 unsigned int chain_size;
1587 struct bio *bio_chain;
1588
1589 BUG_ON(limit > (u64) UINT_MAX);
1590 chain_size = (unsigned int) limit;
Alex Elderbd919d42012-07-13 20:35:11 -05001591 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
Alex Elderf7760da2012-10-20 22:17:27 -05001592
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001593 kref_get(&coll->kref);
Alex Elderf7760da2012-10-20 22:17:27 -05001594
1595 /* Pass a cloned bio chain via an osd request */
1596
1597 bio_chain = bio_chain_clone_range(&bio,
1598 &bio_offset, chain_size,
1599 GFP_ATOMIC);
1600 if (bio_chain)
Alex Elder46342462012-10-10 18:59:29 -07001601 (void) rbd_do_op(rq, rbd_dev, snapc,
Alex Elderf7760da2012-10-20 22:17:27 -05001602 ofs, chain_size,
1603 bio_chain, coll, cur_seg);
Alex Elder46342462012-10-10 18:59:29 -07001604 else
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001605 rbd_coll_end_req_index(rq, coll, cur_seg,
Alex Elderf7760da2012-10-20 22:17:27 -05001606 -ENOMEM, chain_size);
1607 size -= chain_size;
1608 ofs += chain_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001609
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001610 cur_seg++;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001611 } while (size > 0);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001612 kref_put(&coll->kref, rbd_coll_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001613
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001614 spin_lock_irq(q->queue_lock);
Josh Durgind1d25642011-12-05 14:03:05 -08001615
1616 ceph_put_snap_context(snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001617 }
1618}
1619
1620/*
1621 * a queue callback. Makes sure that we don't create a bio that spans across
1622 * multiple osd objects. One exception would be with a single page bios,
Alex Elderf7760da2012-10-20 22:17:27 -05001623 * which we handle later at bio_chain_clone_range()
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001624 */
1625static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1626 struct bio_vec *bvec)
1627{
1628 struct rbd_device *rbd_dev = q->queuedata;
Alex Eldere5cfeed22012-10-20 22:17:27 -05001629 sector_t sector_offset;
1630 sector_t sectors_per_obj;
1631 sector_t obj_sector_offset;
1632 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001633
Alex Eldere5cfeed22012-10-20 22:17:27 -05001634 /*
1635 * Find how far into its rbd object the partition-relative
1636 * bio start sector is to offset relative to the enclosing
1637 * device.
1638 */
1639 sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
1640 sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1641 obj_sector_offset = sector_offset & (sectors_per_obj - 1);
Alex Elder593a9e72012-02-07 12:03:37 -06001642
Alex Eldere5cfeed22012-10-20 22:17:27 -05001643 /*
1644 * Compute the number of bytes from that offset to the end
1645 * of the object. Account for what's already used by the bio.
1646 */
1647 ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
1648 if (ret > bmd->bi_size)
1649 ret -= bmd->bi_size;
1650 else
1651 ret = 0;
1652
1653 /*
1654 * Don't send back more than was asked for. And if the bio
1655 * was empty, let the whole thing through because: "Note
1656 * that a block device *must* allow a single page to be
1657 * added to an empty bio."
1658 */
1659 rbd_assert(bvec->bv_len <= PAGE_SIZE);
1660 if (ret > (int) bvec->bv_len || !bmd->bi_size)
1661 ret = (int) bvec->bv_len;
1662
1663 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001664}
1665
1666static void rbd_free_disk(struct rbd_device *rbd_dev)
1667{
1668 struct gendisk *disk = rbd_dev->disk;
1669
1670 if (!disk)
1671 return;
1672
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001673 if (disk->flags & GENHD_FL_UP)
1674 del_gendisk(disk);
1675 if (disk->queue)
1676 blk_cleanup_queue(disk->queue);
1677 put_disk(disk);
1678}
1679
1680/*
Alex Elder4156d992012-08-02 11:29:46 -05001681 * Read the complete header for the given rbd device.
1682 *
1683 * Returns a pointer to a dynamically-allocated buffer containing
1684 * the complete and validated header. Caller can pass the address
1685 * of a variable that will be filled in with the version of the
1686 * header object at the time it was read.
1687 *
1688 * Returns a pointer-coded errno if a failure occurs.
1689 */
static struct rbd_image_header_ondisk *
rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;	/* assume no snapshots on the first pass */
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		kfree(ondisk);	/* no-op on the first iteration */

		/* Fixed header, then snapshot id array, then name block */
		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return ERR_PTR(-ENOMEM);

		ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
				       rbd_dev->header_name,
				       0, size,
				       (char *) ondisk, version);

		if (ret < 0)
			goto out_err;
		/* We need the whole header; treat a short read as an error */
		if (WARN_ON((size_t) ret < size)) {
			ret = -ENXIO;
			pr_warning("short header read for image %s"
					" (want %zd got %d)\n",
				rbd_dev->image_name, size, ret);
			goto out_err;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			pr_warning("invalid header for image %s\n",
						rbd_dev->image_name);
			goto out_err;
		}

		/* Loop again if the snapshot count changed under us */
		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	return ondisk;

out_err:
	kfree(ondisk);

	return ERR_PTR(ret);
}
1751
1752/*
 * Re-read the on-disk header and convert it to in-core form
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001754 */
1755static int rbd_read_header(struct rbd_device *rbd_dev,
1756 struct rbd_image_header *header)
1757{
Alex Elder4156d992012-08-02 11:29:46 -05001758 struct rbd_image_header_ondisk *ondisk;
1759 u64 ver = 0;
1760 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001761
Alex Elder4156d992012-08-02 11:29:46 -05001762 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1763 if (IS_ERR(ondisk))
1764 return PTR_ERR(ondisk);
1765 ret = rbd_header_from_disk(header, ondisk);
1766 if (ret >= 0)
1767 header->obj_version = ver;
1768 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001769
Alex Elder4156d992012-08-02 11:29:46 -05001770 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001771}
1772
Alex Elder41f38c22012-10-25 23:34:40 -05001773static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001774{
1775 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001776 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001777
Alex Eldera0593292012-07-19 09:09:27 -05001778 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder41f38c22012-10-25 23:34:40 -05001779 rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001780}
1781
Alex Elder94785542012-10-09 13:50:17 -07001782static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1783{
1784 sector_t size;
1785
Alex Elder971f8392012-10-25 23:34:41 -05001786 if (rbd_dev->snap_id != CEPH_NOSNAP)
Alex Elder94785542012-10-09 13:50:17 -07001787 return;
1788
1789 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1790 dout("setting size to %llu sectors", (unsigned long long) size);
1791 rbd_dev->mapping.size = (u64) size;
1792 set_capacity(rbd_dev->disk, size);
1793}
1794
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001795/*
1796 * only read the first part of the ondisk header, without the snaps info
1797 */
Alex Elder117973f2012-08-31 17:29:55 -05001798static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001799{
1800 int ret;
1801 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001802
1803 ret = rbd_read_header(rbd_dev, &h);
1804 if (ret < 0)
1805 return ret;
1806
Josh Durgina51aa0c2011-12-05 10:35:04 -08001807 down_write(&rbd_dev->header_rwsem);
1808
Alex Elder94785542012-10-09 13:50:17 -07001809 /* Update image size, and check for resize of mapped image */
1810 rbd_dev->header.image_size = h.image_size;
1811 rbd_update_mapping_size(rbd_dev);
Sage Weil9db4b3e2011-04-19 22:49:06 -07001812
Alex Elder849b4262012-07-09 21:04:24 -05001813 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001814 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05001815 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08001816 /* osd requests may still refer to snapc */
1817 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001818
Alex Elderb8136232012-07-25 09:32:41 -05001819 if (hver)
1820 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08001821 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08001822 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001823 rbd_dev->header.snapc = h.snapc;
1824 rbd_dev->header.snap_names = h.snap_names;
1825 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05001826 /* Free the extra copy of the object prefix */
1827 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1828 kfree(h.object_prefix);
1829
Alex Elder304f6802012-08-31 17:29:52 -05001830 ret = rbd_dev_snaps_update(rbd_dev);
1831 if (!ret)
1832 ret = rbd_dev_snaps_register(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001833
Josh Durginc6666012011-11-21 17:11:12 -08001834 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001835
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001836 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001837}
1838
Alex Elder117973f2012-08-31 17:29:55 -05001839static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
Alex Elder1fe5e992012-07-25 09:32:41 -05001840{
1841 int ret;
1842
Alex Elder117973f2012-08-31 17:29:55 -05001843 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder1fe5e992012-07-25 09:32:41 -05001844 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05001845 if (rbd_dev->image_format == 1)
1846 ret = rbd_dev_v1_refresh(rbd_dev, hver);
1847 else
1848 ret = rbd_dev_v2_refresh(rbd_dev, hver);
Alex Elder1fe5e992012-07-25 09:32:41 -05001849 mutex_unlock(&ctl_mutex);
1850
1851 return ret;
1852}
1853
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001854static int rbd_init_disk(struct rbd_device *rbd_dev)
1855{
1856 struct gendisk *disk;
1857 struct request_queue *q;
Alex Elder593a9e72012-02-07 12:03:37 -06001858 u64 segment_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001859
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001860 /* create gendisk info */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001861 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1862 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05001863 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001864
Alex Elderf0f8cef2012-01-29 13:57:44 -06001865 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05001866 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001867 disk->major = rbd_dev->major;
1868 disk->first_minor = 0;
1869 disk->fops = &rbd_bd_ops;
1870 disk->private_data = rbd_dev;
1871
1872 /* init rq */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001873 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1874 if (!q)
1875 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07001876
Alex Elder593a9e72012-02-07 12:03:37 -06001877 /* We use the default size, but let's be explicit about it. */
1878 blk_queue_physical_block_size(q, SECTOR_SIZE);
1879
Josh Durgin029bcbd2011-07-22 11:35:23 -07001880 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06001881 segment_size = rbd_obj_bytes(&rbd_dev->header);
1882 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1883 blk_queue_max_segment_size(q, segment_size);
1884 blk_queue_io_min(q, segment_size);
1885 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07001886
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001887 blk_queue_merge_bvec(q, rbd_merge_bvec);
1888 disk->queue = q;
1889
1890 q->queuedata = rbd_dev;
1891
1892 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001893
Alex Elder12f02942012-08-29 17:11:07 -05001894 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
1895
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001896 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001897out_disk:
1898 put_disk(disk);
Alex Elder1fcdb8a2012-08-29 17:11:06 -05001899
1900 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001901}
1902
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001903/*
1904 sysfs
1905*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001906
/* Map a sysfs struct device back to its enclosing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1911
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001912static ssize_t rbd_size_show(struct device *dev,
1913 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001914{
Alex Elder593a9e72012-02-07 12:03:37 -06001915 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001916 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001917
Josh Durgina51aa0c2011-12-05 10:35:04 -08001918 down_read(&rbd_dev->header_rwsem);
1919 size = get_capacity(rbd_dev->disk);
1920 up_read(&rbd_dev->header_rwsem);
1921
1922 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001923}
1924
Alex Elder34b13182012-07-13 20:35:12 -05001925/*
1926 * Note this shows the features for whatever's mapped, which is not
1927 * necessarily the base image.
1928 */
1929static ssize_t rbd_features_show(struct device *dev,
1930 struct device_attribute *attr, char *buf)
1931{
1932 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1933
1934 return sprintf(buf, "0x%016llx\n",
1935 (unsigned long long) rbd_dev->mapping.features);
1936}
1937
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001938static ssize_t rbd_major_show(struct device *dev,
1939 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001940{
Alex Elder593a9e72012-02-07 12:03:37 -06001941 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001942
1943 return sprintf(buf, "%d\n", rbd_dev->major);
1944}
1945
1946static ssize_t rbd_client_id_show(struct device *dev,
1947 struct device_attribute *attr, char *buf)
1948{
Alex Elder593a9e72012-02-07 12:03:37 -06001949 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001950
Alex Elder1dbb4392012-01-24 10:08:37 -06001951 return sprintf(buf, "client%lld\n",
1952 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001953}
1954
1955static ssize_t rbd_pool_show(struct device *dev,
1956 struct device_attribute *attr, char *buf)
1957{
Alex Elder593a9e72012-02-07 12:03:37 -06001958 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001959
1960 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1961}
1962
Alex Elder9bb2f332012-07-12 10:46:35 -05001963static ssize_t rbd_pool_id_show(struct device *dev,
1964 struct device_attribute *attr, char *buf)
1965{
1966 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1967
Alex Elder86992092012-10-25 23:34:41 -05001968 return sprintf(buf, "%llu\n", (unsigned long long) rbd_dev->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05001969}
1970
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001971static ssize_t rbd_name_show(struct device *dev,
1972 struct device_attribute *attr, char *buf)
1973{
Alex Elder593a9e72012-02-07 12:03:37 -06001974 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001975
Alex Elder0bed54d2012-07-03 16:01:18 -05001976 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001977}
1978
Alex Elder589d30e2012-07-10 20:30:11 -05001979static ssize_t rbd_image_id_show(struct device *dev,
1980 struct device_attribute *attr, char *buf)
1981{
1982 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1983
1984 return sprintf(buf, "%s\n", rbd_dev->image_id);
1985}
1986
Alex Elder34b13182012-07-13 20:35:12 -05001987/*
1988 * Shows the name of the currently-mapped snapshot (or
1989 * RBD_SNAP_HEAD_NAME for the base image).
1990 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001991static ssize_t rbd_snap_show(struct device *dev,
1992 struct device_attribute *attr,
1993 char *buf)
1994{
Alex Elder593a9e72012-02-07 12:03:37 -06001995 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001996
Alex Elder971f8392012-10-25 23:34:41 -05001997 return sprintf(buf, "%s\n", rbd_dev->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001998}
1999
2000static ssize_t rbd_image_refresh(struct device *dev,
2001 struct device_attribute *attr,
2002 const char *buf,
2003 size_t size)
2004{
Alex Elder593a9e72012-02-07 12:03:37 -06002005 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05002006 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002007
Alex Elder117973f2012-08-31 17:29:55 -05002008 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05002009
2010 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002011}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002012
/*
 * Per-device sysfs attributes.  All are read-only except
 * "refresh", which is write-only and forces a header re-read.
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002023
/* The device attributes above, grouped for sysfs registration */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

/* NULL-terminated list of groups, as the device core expects */
static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};
2046
/*
 * The struct device is embedded in the rbd_device, whose lifetime
 * is managed elsewhere, so there is nothing to free here.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
2056
2057
2058/*
2059 sysfs - snapshots
2060*/
2061
2062static ssize_t rbd_snap_size_show(struct device *dev,
2063 struct device_attribute *attr,
2064 char *buf)
2065{
2066 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2067
Josh Durgin35915382011-12-05 18:25:13 -08002068 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002069}
2070
2071static ssize_t rbd_snap_id_show(struct device *dev,
2072 struct device_attribute *attr,
2073 char *buf)
2074{
2075 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2076
Josh Durgin35915382011-12-05 18:25:13 -08002077 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002078}
2079
Alex Elder34b13182012-07-13 20:35:12 -05002080static ssize_t rbd_snap_features_show(struct device *dev,
2081 struct device_attribute *attr,
2082 char *buf)
2083{
2084 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2085
2086 return sprintf(buf, "0x%016llx\n",
2087 (unsigned long long) snap->features);
2088}
2089
/* Per-snapshot sysfs attributes (all read-only) */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/*
 * Called by the device core when the snapshot device's last
 * reference is dropped; frees the rbd_snap and its name.
 */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2121
Alex Elder304f6802012-08-31 17:29:52 -05002122static bool rbd_snap_registered(struct rbd_snap *snap)
2123{
2124 bool ret = snap->dev.type == &rbd_snap_device_type;
2125 bool reg = device_is_registered(&snap->dev);
2126
2127 rbd_assert(!ret ^ reg);
2128
2129 return ret;
2130}
2131
/*
 * Take a snapshot off its device's snapshot list and, if its
 * sysfs device was registered, unregister it (which drops the
 * reference and triggers rbd_snap_dev_release()).
 */
static void rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	if (device_is_registered(&snap->dev))
		device_unregister(&snap->dev);
}
2138
Alex Elder14e70852012-07-19 09:09:27 -05002139static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002140 struct device *parent)
2141{
2142 struct device *dev = &snap->dev;
2143 int ret;
2144
2145 dev->type = &rbd_snap_device_type;
2146 dev->parent = parent;
2147 dev->release = rbd_snap_dev_release;
Alex Elderd4b125e2012-07-03 16:01:19 -05002148 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
Alex Elder304f6802012-08-31 17:29:52 -05002149 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2150
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002151 ret = device_register(dev);
2152
2153 return ret;
2154}
2155
Alex Elder4e891e02012-07-10 20:30:10 -05002156static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05002157 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05002158 u64 snap_id, u64 snap_size,
2159 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002160{
Alex Elder4e891e02012-07-10 20:30:10 -05002161 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002162 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002163
2164 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002165 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002166 return ERR_PTR(-ENOMEM);
2167
2168 ret = -ENOMEM;
Alex Elderc8d18422012-07-10 20:30:11 -05002169 snap->name = kstrdup(snap_name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002170 if (!snap->name)
2171 goto err;
2172
Alex Elderc8d18422012-07-10 20:30:11 -05002173 snap->id = snap_id;
2174 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05002175 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05002176
2177 return snap;
2178
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002179err:
2180 kfree(snap->name);
2181 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002182
2183 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002184}
2185
Alex Eldercd892122012-07-03 16:01:19 -05002186static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2187 u64 *snap_size, u64 *snap_features)
2188{
2189 char *snap_name;
2190
2191 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2192
2193 *snap_size = rbd_dev->header.snap_sizes[which];
2194 *snap_features = 0; /* No features for v1 */
2195
2196 /* Skip over names until we find the one we are looking for */
2197
2198 snap_name = rbd_dev->header.snap_names;
2199 while (which--)
2200 snap_name += strlen(snap_name) + 1;
2201
2202 return snap_name;
2203}
2204
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002205/*
Alex Elder9d475de2012-07-03 16:01:19 -05002206 * Get the size and object order for an image snapshot, or if
2207 * snap_id is CEPH_NOSNAP, gets this information for the base
2208 * image.
2209 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	/* Reply layout must match the osd "get_size" method exactly */
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Results are filled in only on success */
	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2238
2239static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2240{
2241 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2242 &rbd_dev->header.obj_order,
2243 &rbd_dev->header.image_size);
2244}
2245
/*
 * Fetch a format 2 image's object name prefix via the osd
 * "get_object_prefix" class method and install it in the header.
 *
 * Returns 0 on success or a negative errno.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;	/* rbd_req_sync_exec() can return positive */

	/* Decode the length-prefixed string into a fresh allocation */
	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
2283
/*
 * Fetch the feature bits for the given snapshot (or the base
 * image, when snap_id is CEPH_NOSNAP) via the osd "get_features"
 * class method.  Fails with -ENOTSUPP if the image requires any
 * incompat feature this driver does not implement.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	/* Reply layout must match the osd "get_features" method */
	struct {
		__le64 features;
		__le64 incompat;
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Refuse to map an image needing features we don't implement */
	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_ALL)
		return -ENOTSUPP;

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2317
2318static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2319{
2320 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2321 &rbd_dev->header.features);
2322}
2323
Alex Elder6e14b1a2012-07-03 16:01:19 -05002324static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05002325{
2326 size_t size;
2327 int ret;
2328 void *reply_buf;
2329 void *p;
2330 void *end;
2331 u64 seq;
2332 u32 snap_count;
2333 struct ceph_snap_context *snapc;
2334 u32 i;
2335
2336 /*
2337 * We'll need room for the seq value (maximum snapshot id),
2338 * snapshot count, and array of that many snapshot ids.
2339 * For now we have a fixed upper limit on the number we're
2340 * prepared to receive.
2341 */
2342 size = sizeof (__le64) + sizeof (__le32) +
2343 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2344 reply_buf = kzalloc(size, GFP_KERNEL);
2345 if (!reply_buf)
2346 return -ENOMEM;
2347
2348 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2349 "rbd", "get_snapcontext",
2350 NULL, 0,
2351 reply_buf, size,
Alex Elder6e14b1a2012-07-03 16:01:19 -05002352 CEPH_OSD_FLAG_READ, ver);
Alex Elder35d489f2012-07-03 16:01:19 -05002353 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2354 if (ret < 0)
2355 goto out;
2356
2357 ret = -ERANGE;
2358 p = reply_buf;
2359 end = (char *) reply_buf + size;
2360 ceph_decode_64_safe(&p, end, seq, out);
2361 ceph_decode_32_safe(&p, end, snap_count, out);
2362
2363 /*
2364 * Make sure the reported number of snapshot ids wouldn't go
2365 * beyond the end of our buffer. But before checking that,
2366 * make sure the computed size of the snapshot context we
2367 * allocate is representable in a size_t.
2368 */
2369 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2370 / sizeof (u64)) {
2371 ret = -EINVAL;
2372 goto out;
2373 }
2374 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2375 goto out;
2376
2377 size = sizeof (struct ceph_snap_context) +
2378 snap_count * sizeof (snapc->snaps[0]);
2379 snapc = kmalloc(size, GFP_KERNEL);
2380 if (!snapc) {
2381 ret = -ENOMEM;
2382 goto out;
2383 }
2384
2385 atomic_set(&snapc->nref, 1);
2386 snapc->seq = seq;
2387 snapc->num_snaps = snap_count;
2388 for (i = 0; i < snap_count; i++)
2389 snapc->snaps[i] = ceph_decode_64(&p);
2390
2391 rbd_dev->header.snapc = snapc;
2392
2393 dout(" snap context seq = %llu, snap_count = %u\n",
2394 (unsigned long long) seq, (unsigned int) snap_count);
2395
2396out:
2397 kfree(reply_buf);
2398
2399 return 0;
2400}
2401
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002402static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
2403{
2404 size_t size;
2405 void *reply_buf;
2406 __le64 snap_id;
2407 int ret;
2408 void *p;
2409 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002410 char *snap_name;
2411
2412 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
2413 reply_buf = kmalloc(size, GFP_KERNEL);
2414 if (!reply_buf)
2415 return ERR_PTR(-ENOMEM);
2416
2417 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
2418 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2419 "rbd", "get_snapshot_name",
2420 (char *) &snap_id, sizeof (snap_id),
2421 reply_buf, size,
2422 CEPH_OSD_FLAG_READ, NULL);
2423 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2424 if (ret < 0)
2425 goto out;
2426
2427 p = reply_buf;
2428 end = (char *) reply_buf + size;
Alex Eldere5c35532012-10-25 23:34:41 -05002429 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002430 if (IS_ERR(snap_name)) {
2431 ret = PTR_ERR(snap_name);
2432 goto out;
2433 } else {
2434 dout(" snap_id 0x%016llx snap_name = %s\n",
2435 (unsigned long long) le64_to_cpu(snap_id), snap_name);
2436 }
2437 kfree(reply_buf);
2438
2439 return snap_name;
2440out:
2441 kfree(reply_buf);
2442
2443 return ERR_PTR(ret);
2444}
2445
2446static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2447 u64 *snap_size, u64 *snap_features)
2448{
2449 __le64 snap_id;
2450 u8 order;
2451 int ret;
2452
2453 snap_id = rbd_dev->header.snapc->snaps[which];
2454 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2455 if (ret)
2456 return ERR_PTR(ret);
2457 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2458 if (ret)
2459 return ERR_PTR(ret);
2460
2461 return rbd_dev_v2_snap_name(rbd_dev, which);
2462}
2463
2464static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2465 u64 *snap_size, u64 *snap_features)
2466{
2467 if (rbd_dev->image_format == 1)
2468 return rbd_dev_v1_snap_info(rbd_dev, which,
2469 snap_size, snap_features);
2470 if (rbd_dev->image_format == 2)
2471 return rbd_dev_v2_snap_info(rbd_dev, which,
2472 snap_size, snap_features);
2473 return ERR_PTR(-EINVAL);
2474}
2475
Alex Elder117973f2012-08-31 17:29:55 -05002476static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2477{
2478 int ret;
2479 __u8 obj_order;
2480
2481 down_write(&rbd_dev->header_rwsem);
2482
2483 /* Grab old order first, to see if it changes */
2484
2485 obj_order = rbd_dev->header.obj_order,
2486 ret = rbd_dev_v2_image_size(rbd_dev);
2487 if (ret)
2488 goto out;
2489 if (rbd_dev->header.obj_order != obj_order) {
2490 ret = -EIO;
2491 goto out;
2492 }
2493 rbd_update_mapping_size(rbd_dev);
2494
2495 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2496 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2497 if (ret)
2498 goto out;
2499 ret = rbd_dev_snaps_update(rbd_dev);
2500 dout("rbd_dev_snaps_update returned %d\n", ret);
2501 if (ret)
2502 goto out;
2503 ret = rbd_dev_snaps_register(rbd_dev);
2504 dout("rbd_dev_snaps_register returned %d\n", ret);
2505out:
2506 up_write(&rbd_dev->header_rwsem);
2507
2508 return ret;
2509}
2510
Alex Elder9d475de2012-07-03 16:01:19 -05002511/*
Alex Elder35938152012-08-02 11:29:46 -05002512 * Scan the rbd device's current snapshot list and compare it to the
2513 * newly-received snapshot context. Remove any existing snapshots
2514 * not present in the new snapshot context. Add a new snapshot for
 2515 * any snapshots in the snapshot context not in the current list.
2516 * And verify there are no changes to snapshots we already know
2517 * about.
2518 *
2519 * Assumes the snapshots in the snapshot context are sorted by
2520 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2521 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002522 */
Alex Elder304f6802012-08-31 17:29:52 -05002523static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002524{
Alex Elder35938152012-08-02 11:29:46 -05002525 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2526 const u32 snap_count = snapc->num_snaps;
Alex Elder35938152012-08-02 11:29:46 -05002527 struct list_head *head = &rbd_dev->snaps;
2528 struct list_head *links = head->next;
2529 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002530
Alex Elder9fcbb802012-08-23 23:48:49 -05002531 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
Alex Elder35938152012-08-02 11:29:46 -05002532 while (index < snap_count || links != head) {
2533 u64 snap_id;
2534 struct rbd_snap *snap;
Alex Eldercd892122012-07-03 16:01:19 -05002535 char *snap_name;
2536 u64 snap_size = 0;
2537 u64 snap_features = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002538
Alex Elder35938152012-08-02 11:29:46 -05002539 snap_id = index < snap_count ? snapc->snaps[index]
2540 : CEPH_NOSNAP;
2541 snap = links != head ? list_entry(links, struct rbd_snap, node)
2542 : NULL;
Alex Elderaafb230e2012-09-06 16:00:54 -05002543 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002544
Alex Elder35938152012-08-02 11:29:46 -05002545 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2546 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002547
Alex Elder35938152012-08-02 11:29:46 -05002548 /* Existing snapshot not in the new snap context */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002549
Alex Elder971f8392012-10-25 23:34:41 -05002550 if (rbd_dev->snap_id == snap->id)
Alex Elderdaba5fd2012-10-26 17:25:23 -05002551 rbd_dev->exists = false;
Alex Elder41f38c22012-10-25 23:34:40 -05002552 rbd_remove_snap_dev(snap);
Alex Elder9fcbb802012-08-23 23:48:49 -05002553 dout("%ssnap id %llu has been removed\n",
Alex Elder971f8392012-10-25 23:34:41 -05002554 rbd_dev->snap_id == snap->id ? "mapped " : "",
Alex Elder9fcbb802012-08-23 23:48:49 -05002555 (unsigned long long) snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002556
Alex Elder35938152012-08-02 11:29:46 -05002557 /* Done with this list entry; advance */
2558
2559 links = next;
2560 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002561 }
Alex Elder35938152012-08-02 11:29:46 -05002562
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002563 snap_name = rbd_dev_snap_info(rbd_dev, index,
2564 &snap_size, &snap_features);
Alex Eldercd892122012-07-03 16:01:19 -05002565 if (IS_ERR(snap_name))
2566 return PTR_ERR(snap_name);
2567
Alex Elder9fcbb802012-08-23 23:48:49 -05002568 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2569 (unsigned long long) snap_id);
Alex Elder35938152012-08-02 11:29:46 -05002570 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2571 struct rbd_snap *new_snap;
2572
2573 /* We haven't seen this snapshot before */
2574
Alex Elderc8d18422012-07-10 20:30:11 -05002575 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
Alex Eldercd892122012-07-03 16:01:19 -05002576 snap_id, snap_size, snap_features);
Alex Elder9fcbb802012-08-23 23:48:49 -05002577 if (IS_ERR(new_snap)) {
2578 int err = PTR_ERR(new_snap);
2579
2580 dout(" failed to add dev, error %d\n", err);
2581
2582 return err;
2583 }
Alex Elder35938152012-08-02 11:29:46 -05002584
2585 /* New goes before existing, or at end of list */
2586
Alex Elder9fcbb802012-08-23 23:48:49 -05002587 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05002588 if (snap)
2589 list_add_tail(&new_snap->node, &snap->node);
2590 else
Alex Elder523f3252012-08-30 00:16:37 -05002591 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05002592 } else {
2593 /* Already have this one */
2594
Alex Elder9fcbb802012-08-23 23:48:49 -05002595 dout(" already present\n");
2596
Alex Eldercd892122012-07-03 16:01:19 -05002597 rbd_assert(snap->size == snap_size);
Alex Elderaafb230e2012-09-06 16:00:54 -05002598 rbd_assert(!strcmp(snap->name, snap_name));
Alex Eldercd892122012-07-03 16:01:19 -05002599 rbd_assert(snap->features == snap_features);
Alex Elder35938152012-08-02 11:29:46 -05002600
2601 /* Done with this list entry; advance */
2602
2603 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002604 }
Alex Elder35938152012-08-02 11:29:46 -05002605
2606 /* Advance to the next entry in the snapshot context */
2607
2608 index++;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002609 }
Alex Elder9fcbb802012-08-23 23:48:49 -05002610 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002611
2612 return 0;
2613}
2614
Alex Elder304f6802012-08-31 17:29:52 -05002615/*
2616 * Scan the list of snapshots and register the devices for any that
2617 * have not already been registered.
2618 */
2619static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2620{
2621 struct rbd_snap *snap;
2622 int ret = 0;
2623
2624 dout("%s called\n", __func__);
Alex Elder86ff77b2012-08-31 17:29:53 -05002625 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2626 return -EIO;
Alex Elder304f6802012-08-31 17:29:52 -05002627
2628 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2629 if (!rbd_snap_registered(snap)) {
2630 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2631 if (ret < 0)
2632 break;
2633 }
2634 }
2635 dout("%s: returning %d\n", __func__, ret);
2636
2637 return ret;
2638}
2639
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002640static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2641{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002642 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05002643 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002644
2645 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002646
Alex Eldercd789ab2012-08-30 00:16:38 -05002647 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002648 dev->bus = &rbd_bus_type;
2649 dev->type = &rbd_device_type;
2650 dev->parent = &rbd_root_dev;
2651 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05002652 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002653 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002654
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002655 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05002656
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002657 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002658}
2659
/*
 * Undo rbd_bus_add_dev(): unregister the embedded struct device,
 * dropping the reference taken by device_register().
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2664
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002665static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2666{
2667 int ret, rc;
2668
2669 do {
Alex Elder0e6f3222012-07-25 09:32:40 -05002670 ret = rbd_req_sync_watch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002671 if (ret == -ERANGE) {
Alex Elder117973f2012-08-31 17:29:55 -05002672 rc = rbd_dev_refresh(rbd_dev, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002673 if (rc < 0)
2674 return rc;
2675 }
2676 } while (ret == -ERANGE);
2677
2678 return ret;
2679}
2680
Alex Eldere2839302012-08-29 17:11:06 -05002681static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06002682
2683/*
Alex Elder499afd52012-02-02 08:13:29 -06002684 * Get a unique rbd identifier for the given new rbd_dev, and add
2685 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06002686 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* Ids increase monotonically; the first id handed out is 1 */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	/* rbd_dev_list is protected by rbd_dev_list_lock */
	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002697
Alex Elder1ddbe942012-01-29 13:57:44 -06002698/*
Alex Elder499afd52012-02-02 08:13:29 -06002699 * Remove an rbd_dev from the global list, and record that its
2700 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002701 */
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
{
	struct list_head *tmp;
	int rbd_id = rbd_dev->dev_id;
	int max_id;

	rbd_assert(rbd_id > 0);

	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);

	/*
	 * If the id being "put" is not the current maximum, there
	 * is nothing special we need to do.
	 */
	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
		spin_unlock(&rbd_dev_list_lock);
		return;
	}

	/*
	 * We need to update the current maximum id.  Search the
	 * list to find out what it is.  We're more likely to find
	 * the maximum at the end, so search the list backward.
	 */
	max_id = 0;
	list_for_each_prev(tmp, &rbd_dev_list) {
		/* Deliberately shadows the outer rbd_dev parameter */
		struct rbd_device *rbd_dev;

		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id > max_id)
			max_id = rbd_dev->dev_id;
	}
	spin_unlock(&rbd_dev_list_lock);

	/*
	 * The max id could have been updated by rbd_dev_id_get(), in
	 * which case it now accurately reflects the new maximum.
	 * Be careful not to overwrite the maximum value in that
	 * case.
	 */
	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
	dout(" max dev id has been reset\n");
}
2748
Alex Eldera725f65e2012-02-02 08:13:30 -06002749/*
Alex Eldere28fff262012-02-02 08:13:30 -06002750 * Skips over white space at *buf, and updates *buf to point to the
2751 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06002752 * the token (string of non-white space characters) found. Note
2753 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06002754 */
static inline size_t next_token(const char **buf)
{
	/*
	 * Exactly the characters for which isspace() is nonzero in
	 * the "C" and "POSIX" locales.
	 */
	const char *whitespace = " \f\n\r\t\v";
	size_t skipped;

	/* Advance past any leading white space */
	skipped = strspn(*buf, whitespace);
	*buf += skipped;

	/* Length of the run of non-space characters now at *buf */
	return strcspn(*buf, whitespace);
}
2767
2768/*
2769 * Finds the next token in *buf, and if the provided token buffer is
2770 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06002771 * copied, is guaranteed to be terminated with '\0'. Note that *buf
2772 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06002773 *
2774 * Returns the length of the token found (not including the '\0').
2775 * Return value will be 0 if no token is found, and it will be >=
2776 * token_size if the token would not fit.
2777 *
Alex Elder593a9e72012-02-07 12:03:37 -06002778 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06002779 * found token. Note that this occurs even if the token buffer is
2780 * too small to hold it.
2781 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	/* Copy only if the token (plus its terminator) fits */
	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;	/* advance past the token regardless */

	return len;
}
2797
2798/*
Alex Elderea3352f2012-07-09 21:04:23 -05002799 * Finds the next token in *buf, dynamically allocates a buffer big
2800 * enough to hold a copy of it, and copies the token into the new
2801 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2802 * that a duplicate buffer is created even for a zero-length token.
2803 *
2804 * Returns a pointer to the newly-allocated duplicate, or a null
2805 * pointer if memory for the duplicate was not available. If
2806 * the lenp argument is a non-null pointer, the length of the token
2807 * (not including the '\0') is returned in *lenp.
2808 *
2809 * If successful, the *buf pointer will be updated to point beyond
2810 * the end of the found token.
2811 *
2812 * Note: uses GFP_KERNEL for allocation.
2813 */
2814static inline char *dup_token(const char **buf, size_t *lenp)
2815{
2816 char *dup;
2817 size_t len;
2818
2819 len = next_token(buf);
2820 dup = kmalloc(len + 1, GFP_KERNEL);
2821 if (!dup)
2822 return NULL;
2823
2824 memcpy(dup, *buf, len);
2825 *(dup + len) = '\0';
2826 *buf += len;
2827
2828 if (lenp)
2829 *lenp = len;
2830
2831 return dup;
2832}
2833
2834/*
Alex Elder3feeb8942012-08-31 17:29:52 -05002835 * This fills in the pool_name, image_name, image_name_len, rbd_dev,
2836 * rbd_md_name, and name fields of the given rbd_dev, based on the
2837 * list of monitor addresses and other options provided via
2838 * /sys/bus/rbd/add. Returns a pointer to a dynamically-allocated
2839 * copy of the snapshot name to map if successful, or a
2840 * pointer-coded error otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05002841 *
2842 * Note: rbd_dev is assumed to have been initially zero-filled.
Alex Eldera725f65e2012-02-02 08:13:30 -06002843 */
static struct ceph_options *rbd_add_parse_args(struct rbd_device *rbd_dev,
				const char *buf)
{
	size_t len;
	const char *mon_addrs;
	size_t mon_addrs_size;
	char *options;
	struct ceph_options *err_ptr = ERR_PTR(-EINVAL);
	struct rbd_options rbd_opts;
	struct ceph_options *ceph_opts;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len)
		return err_ptr;	/* Missing monitor address(es) */
	/* Monitor addresses are not copied; they're parsed in place below */
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	options = dup_token(&buf, NULL);
	if (!options)
		goto out_mem;
	if (!*options)
		goto out_err;	/* Missing options */

	/* Pool and image names become owned by rbd_dev (freed on error) */
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_mem;
	if (!*rbd_dev->pool_name)
		goto out_err;	/* Missing pool name */

	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_mem;
	if (!*rbd_dev->image_name)
		goto out_err;	/* Missing image name */

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		err_ptr = ERR_PTR(-ENAMETOOLONG);
		goto out_err;
	}
	rbd_dev->snap_name = kmalloc(len + 1, GFP_KERNEL);
	if (!rbd_dev->snap_name)
		goto out_mem;
	memcpy(rbd_dev->snap_name, buf, len);
	*(rbd_dev->snap_name + len) = '\0';

	/* Initialize all rbd options to the defaults */

	rbd_opts.read_only = RBD_READ_ONLY_DEFAULT;

	/* rbd-specific option tokens are handled by parse_rbd_opts_token() */
	ceph_opts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, &rbd_opts);
	kfree(options);

	/* Record the parsed rbd options */

	if (!IS_ERR(ceph_opts))
		rbd_dev->mapping.read_only = rbd_opts.read_only;
	/*
	 * NOTE(review): if ceph_parse_options() fails, the already-
	 * allocated pool_name/image_name/snap_name are returned still
	 * attached to rbd_dev -- confirm the caller's error path frees
	 * them.
	 */

	return ceph_opts;
out_mem:
	err_ptr = ERR_PTR(-ENOMEM);
out_err:
	/* kfree(NULL) is safe, so partially-parsed state needs no checks */
	kfree(rbd_dev->image_name);
	rbd_dev->image_name = NULL;
	rbd_dev->image_name_len = 0;
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;
	kfree(options);

	return err_ptr;
}
2927
Alex Elder589d30e2012-07-10 20:30:11 -05002928/*
2929 * An rbd format 2 image has a unique identifier, distinct from the
2930 * name given to it by the user. Internally, that identifier is
2931 * what's used to specify the names of objects related to the image.
2932 *
2933 * A special "rbd id" object is used to map an rbd image name to its
2934 * id. If that object doesn't exist, then there is no v2 rbd image
2935 * with the supplied name.
2936 *
2937 * This function will record the given rbd_dev's image_id field if
2938 * it can be determined, and in that case will return 0. If any
2939 * errors occur a negative errno will be returned and the rbd_dev's
2940 * image_id field will be unchanged (and should be NULL).
2941 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	/* sizeof (RBD_ID_PREFIX) includes room for the terminating NUL */
	size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len;
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	/* Decode the length-prefixed id string into a fresh allocation */
	p = response;
	rbd_dev->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						&rbd_dev->image_id_len,
						GFP_NOIO);
	if (IS_ERR(rbd_dev->image_id)) {
		ret = PTR_ERR(rbd_dev->image_id);
		rbd_dev->image_id = NULL;	/* leave field unchanged on error */
	} else {
		dout("image_id is %s\n", rbd_dev->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
2997
/*
 * Set up a format 1 ("old") rbd image: record an empty image id,
 * build the "<name>.rbd" header object name, and read the on-disk
 * header into rbd_dev->header.  Returns 0 or a negative errno; on
 * error all allocations made here are undone.
 */
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->image_id)
		return -ENOMEM;
	rbd_dev->image_id_len = 0;

	/* Record the header object name for this rbd image. */

	/* sizeof (RBD_SUFFIX) also covers the terminating NUL */
	size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;
	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->image_id);
	rbd_dev->image_id = NULL;

	return ret;
}
3040
/*
 * Set up a format 2 rbd image: build the "rbd_header.<id>" object
 * name, then fetch size/order, object prefix, features, and the
 * snapshot context from the header object.  Returns 0 or a negative
 * errno; on error the allocations made here are undone.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len;
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3102
3103/*
3104 * Probe for the existence of the header object for the given rbd
3105 * device. For format 2 images this includes determining the image
3106 * id.
3107 */
/*
 * Probe for the image's header object and initialize the device for
 * whichever image format is found.  A failed image-id lookup (e.g.
 * ENOENT, no id object) means a format 1 image; otherwise format 2.
 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret;

	/*
	 * Get the id from the image id object.  If it's not a
	 * format 2 image, we'll get ENOENT back, and we'll assume
	 * it's a format 1 image.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	ret = ret ? rbd_dev_v1_probe(rbd_dev) : rbd_dev_v2_probe(rbd_dev);
	if (ret)
		dout("probe failed, returning %d\n", ret);

	return ret;
}
3127
/*
 * Handle a write to /sys/bus/rbd/add: allocate an rbd_device, parse
 * the add command in @buf, connect to the cluster, probe the image,
 * and register the mapped image as a block device.
 *
 * Returns @count on success, or a negative errno on failure.  On
 * failure, everything acquired so far is unwound via the goto
 * ladder at the bottom; once rbd_bus_add_dev() has succeeded,
 * cleanup is instead delegated to the sysfs release path (see the
 * comment before down_write() below).
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	/* Hold a module reference for the lifetime of the device. */
	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		goto err_out_mem;

	/* static rbd_device initialization */
	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);
	init_rwsem(&rbd_dev->header_rwsem);

	/* parse add command */
	ceph_opts = rbd_add_parse_args(rbd_dev, buf);
	if (IS_ERR(ceph_opts)) {
		rc = PTR_ERR(ceph_opts);
		goto err_out_mem;
	}

	rc = rbd_get_client(rbd_dev, ceph_opts);
	if (rc < 0)
		goto err_out_args;
	ceph_opts = NULL;	/* ceph_opts now owned by rbd_dev client */

	/* pick the pool */
	osdc = &rbd_dev->rbd_client->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
	if (rc < 0)
		goto err_out_client;
	rbd_dev->pool_id = (u64) rc;

	/* Determine image format (1 or 2) and read its header. */
	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_client;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = rbd_dev_snaps_update(rbd_dev);
	if (rc)
		goto err_out_probe;

	rc = rbd_dev_set_mapping(rbd_dev);
	if (rc)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	rc = register_blkdev(0, rbd_dev->name);
	if (rc < 0)
		goto err_out_id;
	rbd_dev->major = rc;

	/* Set up the blkdev mapping. */

	rc = rbd_init_disk(rbd_dev);
	if (rc)
		goto err_out_blkdev;

	rc = rbd_bus_add_dev(rbd_dev);
	if (rc)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */

	down_write(&rbd_dev->header_rwsem);
	rc = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (rc)
		goto err_out_bus;

	rc = rbd_init_watch_dev(rbd_dev);
	if (rc)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return count;

err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);
	return rc;

	/*
	 * Unwind ladder for failures before bus registration; each
	 * label undoes the step acquired immediately before it, in
	 * reverse order of acquisition.
	 */
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);
err_out_probe:
	rbd_header_free(&rbd_dev->header);
err_out_client:
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev);
	kfree(rbd_dev->image_id);
err_out_args:
	/* Only reached while ceph_opts is still owned by us (non-NULL). */
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_dev->snap_name);
	kfree(rbd_dev->image_name);
	kfree(rbd_dev->pool_name);
err_out_mem:
	kfree(rbd_dev);

	dout("Error adding device %s\n", buf);
	module_put(THIS_MODULE);

	return (ssize_t) rc;
}
3265
Alex Elderde71a292012-07-03 16:01:19 -05003266static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003267{
3268 struct list_head *tmp;
3269 struct rbd_device *rbd_dev;
3270
Alex Eldere124a822012-01-29 13:57:44 -06003271 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003272 list_for_each(tmp, &rbd_dev_list) {
3273 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05003274 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06003275 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003276 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06003277 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003278 }
Alex Eldere124a822012-01-29 13:57:44 -06003279 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003280 return NULL;
3281}
3282
/*
 * Device release callback, invoked by the driver core once the
 * device's last reference is dropped (after rbd_bus_del_dev()).
 * Tears down everything rbd_add() set up, in reverse order:
 * watch state, client, blkdev, header, allocated names, id, and
 * finally the rbd_device itself and the module reference.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop the lingering watch request before dropping the client. */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->snap_name);
	kfree(rbd_dev->image_id);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_dev_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
3317
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003318static ssize_t rbd_remove(struct bus_type *bus,
3319 const char *buf,
3320 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003321{
3322 struct rbd_device *rbd_dev = NULL;
3323 int target_id, rc;
3324 unsigned long ul;
3325 int ret = count;
3326
3327 rc = strict_strtoul(buf, 10, &ul);
3328 if (rc)
3329 return rc;
3330
3331 /* convert to int; abort if we lost anything in the conversion */
3332 target_id = (int) ul;
3333 if (target_id != ul)
3334 return -EINVAL;
3335
3336 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3337
3338 rbd_dev = __rbd_get_dev(target_id);
3339 if (!rbd_dev) {
3340 ret = -ENOENT;
3341 goto done;
3342 }
3343
Alex Elder41f38c22012-10-25 23:34:40 -05003344 rbd_remove_all_snaps(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003345 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003346
3347done:
3348 mutex_unlock(&ctl_mutex);
Alex Elderaafb230e2012-09-06 16:00:54 -05003349
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003350 return ret;
3351}
3352
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003353/*
3354 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003355 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003356 */
3357static int rbd_sysfs_init(void)
3358{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003359 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003360
Alex Elderfed4c142012-02-07 12:03:36 -06003361 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06003362 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003363 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003364
Alex Elderfed4c142012-02-07 12:03:36 -06003365 ret = bus_register(&rbd_bus_type);
3366 if (ret < 0)
3367 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003368
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003369 return ret;
3370}
3371
/* Tear down the sysfs state in reverse order of rbd_sysfs_init(). */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3377
3378int __init rbd_init(void)
3379{
3380 int rc;
3381
3382 rc = rbd_sysfs_init();
3383 if (rc)
3384 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06003385 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003386 return 0;
3387}
3388
/* Module exit point: remove the rbd sysfs interface. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3393
/* Module registration and metadata. */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");