blob: 89576a0b3f2ed5b099ecf7ce675104d282b0dd48 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elderaafb230e2012-09-06 16:00:54 -050044#define RBD_DEBUG /* Activate rbd_assert() calls */
45
Alex Elder593a9e72012-02-07 12:03:37 -060046/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
Alex Elderdf111be2012-08-09 10:33:26 -070055/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
Alex Elderf0f8cef2012-01-29 13:57:44 -060059#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070061
62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
63
Alex Elderd4b125e2012-07-03 16:01:19 -050064#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
65#define RBD_MAX_SNAP_NAME_LEN \
66 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
67
Alex Elder35d489f2012-07-03 16:01:19 -050068#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070069#define RBD_MAX_OPT_LEN 1024
70
71#define RBD_SNAP_HEAD_NAME "-"
72
Alex Elder9e15b772012-10-30 19:40:33 -050073/* This allows a single page to hold an image name sent by OSD */
74#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1)
Alex Elder1e130192012-07-03 16:01:19 -050075#define RBD_IMAGE_ID_LEN_MAX 64
Alex Elder9e15b772012-10-30 19:40:33 -050076
Alex Elder1e130192012-07-03 16:01:19 -050077#define RBD_OBJ_PREFIX_LEN_MAX 64
Alex Elder589d30e2012-07-10 20:30:11 -050078
Alex Elderd8891402012-10-09 13:50:17 -070079/* Feature bits */
80
81#define RBD_FEATURE_LAYERING 1
82
83/* Features supported by this (client software) implementation. */
84
85#define RBD_FEATURES_ALL (0)
86
Alex Elder81a89792012-02-02 08:13:30 -060087/*
88 * An RBD device name will be "rbd#", where the "rbd" comes from
89 * RBD_DRV_NAME above, and # is a unique integer identifier.
90 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
91 * enough to hold all possible device names.
92 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070093#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060094#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070095
Alex Eldercc0538b2012-08-10 13:12:07 -070096#define RBD_READ_ONLY_DEFAULT false
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070097
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These fields never change for a given rbd image */
	char *object_prefix;	/* NUL-terminated data-object name prefix */
	u64 features;		/* RBD_FEATURE_* bits (0 for format 1 images) */
	__u8 obj_order;		/* object size is 2^obj_order bytes */
	__u8 crypt_type;	/* on-disk encryption type */
	__u8 comp_type;		/* on-disk compression type */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* current image size, in bytes */
	struct ceph_snap_context *snapc;	/* snapshot ids (refcounted) */
	char *snap_names;	/* snapshot name data copied from on-disk header */
	u64 *snap_sizes;	/* per-snapshot image size, parallel to snapc->snaps[] */

	u64 obj_version;	/* version of the header object last read */
};
117
/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  The remaining fields are human-readable names
 * resolved for/from that tuple.
 */
struct rbd_spec {
	u64 pool_id;		/* numeric id of the pool holding the image */
	char *pool_name;

	char *image_id;		/* unique image id (format 2) */
	size_t image_id_len;
	char *image_name;
	size_t image_name_len;

	u64 snap_id;		/* CEPH_NOSNAP when the head is mapped */
	char *snap_name;	/* RBD_SNAP_HEAD_NAME for the head */

	struct kref kref;	/* specs may be shared; refcounted */
};
138
/* Mapping options parsed from the rbd "add" string (see rbd_opts_tokens) */
struct rbd_options {
	bool read_only;		/* defaults to RBD_READ_ONLY_DEFAULT */
};
142
/*
 * an instance of the client. multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client *client;	/* underlying ceph cluster client */
	struct kref kref;		/* shared via rbd_client_find()/rbd_put_client() */
	struct list_head node;		/* entry on rbd_client_list */
};
151
/*
 * a request completion status
 */
struct rbd_req_status {
	int done;	/* nonzero once this request has completed */
	int rc;		/* result code reported at completion */
	u64 bytes;	/* byte count reported at completion */
};
160
/*
 * a collection of requests
 */
struct rbd_req_coll {
	int total;			/* number of entries in status[] */
	int num_done;			/* how many entries have completed */
	struct kref kref;		/* released via rbd_coll_release() */
	struct rbd_req_status status[0];	/* trailing per-request status array */
};
170
/*
 * a single io request
 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;			/* length of this request, in bytes */
	int coll_index;			/* this request's slot in coll->status[] */
	struct rbd_req_coll *coll;	/* collection this request belongs to */
};
182
/* In-memory record of one snapshot; also exposed as a sysfs device */
struct rbd_snap {
	struct device dev;		/* sysfs device for this snapshot */
	const char *name;		/* snapshot name */
	u64 size;			/* image size at snapshot time */
	struct list_head node;		/* entry on rbd_device->snaps */
	u64 id;				/* snapshot id */
	u64 features;			/* feature bits at snapshot time */
};
191
/* Attributes of whatever is currently mapped (image head or a snapshot) */
struct rbd_mapping {
	u64 size;		/* size of the mapped head/snapshot */
	u64 features;		/* feature bits of the mapped head/snapshot */
	bool read_only;		/* always true when a snapshot is mapped */
};
197
/*
 * a single device
 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;		/* blkdev's gendisk and rq */

	u32 image_format;	/* Either 1 or 2 */
	struct rbd_client *rbd_client;	/* (possibly shared) ceph client handle */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;	/* queue lock */

	struct rbd_image_header header;	/* in-memory image metadata */
	bool exists;		/* set once the mapping is established */
	struct rbd_spec *spec;	/* which pool/image/snapshot is mapped */

	char *header_name;	/* name of the image's header object */

	struct ceph_osd_event *watch_event;
	struct ceph_osd_request *watch_request;

	struct rbd_spec *parent_spec;	/* layered images only; NULL otherwise */
	u64 parent_overlap;		/* layering: overlap with the parent image */

	/* protects updating the header */
	struct rw_semaphore header_rwsem;

	struct rbd_mapping mapping;	/* what is currently mapped */

	struct list_head node;		/* entry on rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
	unsigned long open_count;	/* opens in flight; guarded by ctl_mutex */
};
240
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700241static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a822012-01-29 13:57:44 -0600242
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700243static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a822012-01-29 13:57:44 -0600244static DEFINE_SPINLOCK(rbd_dev_list_lock);
245
Alex Elder432b8582012-01-29 13:57:44 -0600246static LIST_HEAD(rbd_client_list); /* clients */
247static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700248
Alex Elder304f6802012-08-31 17:29:52 -0500249static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
250static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
251
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800252static void rbd_dev_release(struct device *dev);
Alex Elder41f38c22012-10-25 23:34:40 -0500253static void rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800254
Alex Elderf0f8cef2012-01-29 13:57:44 -0600255static ssize_t rbd_add(struct bus_type *bus, const char *buf,
256 size_t count);
257static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
258 size_t count);
259
/* Control interface: echo to /sys/bus/rbd/add and /sys/bus/rbd/remove */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/* Empty release: rbd_root_dev is statically allocated, nothing to free */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Parent device for all rbd devices registered with sysfs */
static struct device rbd_root_dev = {
	.init_name = "rbd",
	.release = rbd_root_dev_release,
};
279
#ifdef RBD_DEBUG
/*
 * Verify an invariant; on failure report the failing expression and
 * BUG().  Wrapped in do { } while (0) so the macro behaves as a single
 * statement and composes safely with unbraced if/else (the bare
 * if-statement form had a dangling-else hazard).
 */
#define rbd_assert(expr)						\
		do {							\
			if (unlikely(!(expr))) {			\
				printk(KERN_ERR "\nAssertion failure in %s() " \
							"at line %d:\n\n" \
						"\trbd_assert(%s);\n\n", \
						__func__, __LINE__, #expr); \
				BUG();					\
			}						\
		} while (0)
#else /* !RBD_DEBUG */
# define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800292
Alex Elder117973f2012-08-31 17:29:55 -0500293static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
294static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700295
/*
 * Open the rbd block device.  Refuses writable opens of a read-only
 * mapping; otherwise pins the device and counts the open.
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	/* A read-only mapping can never be opened for writing */
	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	/* Hold a device reference for the lifetime of this open */
	(void) get_device(&rbd_dev->dev);
	set_device_ro(bdev, rbd_dev->mapping.read_only);
	rbd_dev->open_count++;
	mutex_unlock(&ctl_mutex);

	return 0;
}
311
/*
 * Release the rbd block device: undo rbd_open()'s open count and
 * device reference.  Always succeeds.
 */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	rbd_assert(rbd_dev->open_count > 0);	/* releases pair with opens */
	rbd_dev->open_count--;
	put_device(&rbd_dev->dev);
	mutex_unlock(&ctl_mutex);

	return 0;
}
324
/* Block device operations: rbd devices only implement open/release */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
330
331/*
332 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500333 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700334 */
Alex Elderf8c38922012-08-10 13:12:07 -0700335static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700336{
337 struct rbd_client *rbdc;
338 int ret = -ENOMEM;
339
340 dout("rbd_client_create\n");
341 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
342 if (!rbdc)
343 goto out_opt;
344
345 kref_init(&rbdc->kref);
346 INIT_LIST_HEAD(&rbdc->node);
347
Alex Elderbc534d82012-01-29 13:57:44 -0600348 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
349
Alex Elder43ae4702012-07-03 16:01:18 -0500350 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700351 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600352 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500353 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700354
355 ret = ceph_open_session(rbdc->client);
356 if (ret < 0)
357 goto out_err;
358
Alex Elder432b8582012-01-29 13:57:44 -0600359 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700360 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600361 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700362
Alex Elderbc534d82012-01-29 13:57:44 -0600363 mutex_unlock(&ctl_mutex);
364
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700365 dout("rbd_client_create created %p\n", rbdc);
366 return rbdc;
367
368out_err:
369 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600370out_mutex:
371 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700372 kfree(rbdc);
373out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500374 if (ceph_opts)
375 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400376 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700377}
378
379/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700380 * Find a ceph client with specific addr and configuration. If
381 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700382 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700383static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700384{
385 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700386 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700387
Alex Elder43ae4702012-07-03 16:01:18 -0500388 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700389 return NULL;
390
Alex Elder1f7ba332012-08-10 13:12:07 -0700391 spin_lock(&rbd_client_list_lock);
392 list_for_each_entry(client_node, &rbd_client_list, node) {
393 if (!ceph_compare_options(ceph_opts, client_node->client)) {
394 kref_get(&client_node->kref);
395 found = true;
396 break;
397 }
398 }
399 spin_unlock(&rbd_client_list_lock);
400
401 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700402}
403
404/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700405 * mount options
406 */
enum {
	/*
	 * The Opt_last_* markers split the token space by argument type:
	 * tokens below Opt_last_int take an int argument, those between
	 * Opt_last_int and Opt_last_string take a string, and those
	 * between Opt_last_string and Opt_last_bool are Boolean flags.
	 */
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

/* Token table consumed by match_token() in parse_rbd_opts_token() */
static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};
428
429static int parse_rbd_opts_token(char *c, void *private)
430{
Alex Elder43ae4702012-07-03 16:01:18 -0500431 struct rbd_options *rbd_opts = private;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700432 substring_t argstr[MAX_OPT_ARGS];
433 int token, intval, ret;
434
Alex Elder43ae4702012-07-03 16:01:18 -0500435 token = match_token(c, rbd_opts_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700436 if (token < 0)
437 return -EINVAL;
438
439 if (token < Opt_last_int) {
440 ret = match_int(&argstr[0], &intval);
441 if (ret < 0) {
442 pr_err("bad mount option arg (not int) "
443 "at '%s'\n", c);
444 return ret;
445 }
446 dout("got int token %d val %d\n", token, intval);
447 } else if (token > Opt_last_int && token < Opt_last_string) {
448 dout("got string token %d val %s\n", token,
449 argstr[0].from);
Alex Eldercc0538b2012-08-10 13:12:07 -0700450 } else if (token > Opt_last_string && token < Opt_last_bool) {
451 dout("got Boolean token %d\n", token);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700452 } else {
453 dout("got token %d\n", token);
454 }
455
456 switch (token) {
Alex Eldercc0538b2012-08-10 13:12:07 -0700457 case Opt_read_only:
458 rbd_opts->read_only = true;
459 break;
460 case Opt_read_write:
461 rbd_opts->read_only = false;
462 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700463 default:
Alex Elderaafb230e2012-09-06 16:00:54 -0500464 rbd_assert(false);
465 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700466 }
467 return 0;
468}
469
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  In either case the caller ends up owning a
 * reference on the returned client (or an ERR_PTR on failure).
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (!rbdc)
		return rbd_client_create(ceph_opts);

	/* Reusing an existing client: the options are no longer needed */
	ceph_destroy_options(ceph_opts);
	return rbdc;
}
486
/*
 * Destroy ceph client
 *
 * kref release callback: unlinks the client from rbd_client_list
 * (taking rbd_client_list_lock itself -- the caller must NOT hold
 * it) and tears the ceph client down.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
504
505/*
506 * Drop reference to ceph client node. If it's not referenced anymore, release
507 * it.
508 */
Alex Elder9d3997f2012-10-25 23:34:42 -0500509static void rbd_put_client(struct rbd_client *rbdc)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700510{
Alex Elderc53d5892012-10-25 23:34:42 -0500511 if (rbdc)
512 kref_put(&rbdc->kref, rbd_client_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700513}
514
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700515/*
516 * Destroy requests collection
517 */
518static void rbd_coll_release(struct kref *kref)
519{
520 struct rbd_req_coll *coll =
521 container_of(kref, struct rbd_req_coll, kref);
522
523 dout("rbd_coll_release %p\n", coll);
524 kfree(coll);
525}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700526
/* Only image formats 1 and 2 exist; reject anything else. */
static bool rbd_image_format_valid(u32 image_format)
{
	switch (image_format) {
	case 1:
	case 2:
		return true;
	default:
		return false;
	}
}
531
/*
 * Sanity-check an on-disk (format 1) image header before trusting
 * any of its fields.  Returns false if the header could not have
 * come from a well-formed rbd image.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
570
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700571/*
572 * Create a new header structure, translate header format from the on-disk
573 * header.
574 */
575static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500576 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700577{
Alex Elderccece232012-07-10 20:30:10 -0500578 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500579 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500580 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500581 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700582
Alex Elder6a523252012-07-19 17:12:59 -0500583 memset(header, 0, sizeof (*header));
584
Alex Elder103a1502012-08-02 11:29:45 -0500585 snap_count = le32_to_cpu(ondisk->snap_count);
586
Alex Elder58c17b02012-08-23 23:22:06 -0500587 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
588 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500589 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700590 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500591 memcpy(header->object_prefix, ondisk->object_prefix, len);
592 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600593
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700594 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500595 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
596
Alex Elder621901d2012-08-23 23:22:06 -0500597 /* Save a copy of the snapshot names */
598
Alex Elderf785cc12012-08-23 23:22:06 -0500599 if (snap_names_len > (u64) SIZE_MAX)
600 return -EIO;
601 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700602 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500603 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500604 /*
605 * Note that rbd_dev_v1_header_read() guarantees
606 * the ondisk buffer we're working with has
607 * snap_names_len bytes beyond the end of the
608 * snapshot id array, this memcpy() is safe.
609 */
610 memcpy(header->snap_names, &ondisk->snaps[snap_count],
611 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500612
Alex Elder621901d2012-08-23 23:22:06 -0500613 /* Record each snapshot's size */
614
Alex Elderd2bb24e2012-07-26 23:37:14 -0500615 size = snap_count * sizeof (*header->snap_sizes);
616 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700617 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500618 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500619 for (i = 0; i < snap_count; i++)
620 header->snap_sizes[i] =
621 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700622 } else {
Alex Elderccece232012-07-10 20:30:10 -0500623 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700624 header->snap_names = NULL;
625 header->snap_sizes = NULL;
626 }
Alex Elder849b4262012-07-09 21:04:24 -0500627
Alex Elder34b13182012-07-13 20:35:12 -0500628 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700629 header->obj_order = ondisk->options.order;
630 header->crypt_type = ondisk->options.crypt_type;
631 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500632
Alex Elder621901d2012-08-23 23:22:06 -0500633 /* Allocate and fill in the snapshot context */
634
Alex Elderf84344f2012-08-31 17:29:51 -0500635 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500636 size = sizeof (struct ceph_snap_context);
637 size += snap_count * sizeof (header->snapc->snaps[0]);
638 header->snapc = kzalloc(size, GFP_KERNEL);
639 if (!header->snapc)
640 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700641
642 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500643 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700644 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500645 for (i = 0; i < snap_count; i++)
646 header->snapc->snaps[i] =
647 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700648
649 return 0;
650
Alex Elder6a523252012-07-19 17:12:59 -0500651out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500652 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500653 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700654 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500655 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500656 kfree(header->object_prefix);
657 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500658
Alex Elder00f1f362012-02-07 12:03:36 -0600659 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700660}
661
Alex Elder9e15b772012-10-30 19:40:33 -0500662static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
663{
664 struct rbd_snap *snap;
665
666 if (snap_id == CEPH_NOSNAP)
667 return RBD_SNAP_HEAD_NAME;
668
669 list_for_each_entry(snap, &rbd_dev->snaps, node)
670 if (snap_id == snap->id)
671 return snap->name;
672
673 return NULL;
674}
675
Alex Elder8836b992012-08-30 14:42:15 -0500676static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700677{
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700678
Alex Eldere86924a2012-07-10 20:30:11 -0500679 struct rbd_snap *snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600680
Alex Eldere86924a2012-07-10 20:30:11 -0500681 list_for_each_entry(snap, &rbd_dev->snaps, node) {
682 if (!strcmp(snap_name, snap->name)) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500683 rbd_dev->spec->snap_id = snap->id;
Alex Eldere86924a2012-07-10 20:30:11 -0500684 rbd_dev->mapping.size = snap->size;
Alex Elder34b13182012-07-13 20:35:12 -0500685 rbd_dev->mapping.features = snap->features;
Alex Elder00f1f362012-02-07 12:03:36 -0600686
Alex Eldere86924a2012-07-10 20:30:11 -0500687 return 0;
Alex Elder00f1f362012-02-07 12:03:36 -0600688 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700689 }
Alex Eldere86924a2012-07-10 20:30:11 -0500690
Alex Elder00f1f362012-02-07 12:03:36 -0600691 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700692}
693
/*
 * Establish what the device maps: the image head when the snapshot
 * name is RBD_SNAP_HEAD_NAME, otherwise the named snapshot (which
 * is always read-only).  Marks the device as existing on success;
 * returns -ENOENT when the snapshot name is unknown.
 */
static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
{
	int ret;

	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		/* Mapping the head: size/features come from the header */
		rbd_dev->spec->snap_id = CEPH_NOSNAP;
		rbd_dev->mapping.size = rbd_dev->header.image_size;
		rbd_dev->mapping.features = rbd_dev->header.features;
		ret = 0;
	} else {
		/* Mapping a snapshot: resolve it by name */
		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
		if (ret < 0)
			goto done;
		/* Snapshots are immutable, never writable */
		rbd_dev->mapping.read_only = true;
	}
	rbd_dev->exists = true;
done:
	return ret;
}
714
715static void rbd_header_free(struct rbd_image_header *header)
716{
Alex Elder849b4262012-07-09 21:04:24 -0500717 kfree(header->object_prefix);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500718 header->object_prefix = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700719 kfree(header->snap_sizes);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500720 header->snap_sizes = NULL;
Alex Elder849b4262012-07-09 21:04:24 -0500721 kfree(header->snap_names);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500722 header->snap_names = NULL;
Josh Durgind1d25642011-12-05 14:03:05 -0800723 ceph_put_snap_context(header->snapc);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500724 header->snapc = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700725}
726
Alex Elder65ccfe22012-08-09 10:33:26 -0700727static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700728{
Alex Elder65ccfe22012-08-09 10:33:26 -0700729 char *name;
730 u64 segment;
731 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700732
Alex Elder2fd82b92012-11-09 15:05:54 -0600733 name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
Alex Elder65ccfe22012-08-09 10:33:26 -0700734 if (!name)
735 return NULL;
736 segment = offset >> rbd_dev->header.obj_order;
Alex Elder2fd82b92012-11-09 15:05:54 -0600737 ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
Alex Elder65ccfe22012-08-09 10:33:26 -0700738 rbd_dev->header.object_prefix, segment);
Alex Elder2fd82b92012-11-09 15:05:54 -0600739 if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
Alex Elder65ccfe22012-08-09 10:33:26 -0700740 pr_err("error formatting segment name for #%llu (%d)\n",
741 segment, ret);
742 kfree(name);
743 name = NULL;
744 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700745
Alex Elder65ccfe22012-08-09 10:33:26 -0700746 return name;
747}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700748
Alex Elder65ccfe22012-08-09 10:33:26 -0700749static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
750{
751 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700752
Alex Elder65ccfe22012-08-09 10:33:26 -0700753 return offset & (segment_size - 1);
754}
755
756static u64 rbd_segment_length(struct rbd_device *rbd_dev,
757 u64 offset, u64 length)
758{
759 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
760
761 offset &= segment_size - 1;
762
Alex Elderaafb230e2012-09-06 16:00:54 -0500763 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700764 if (offset + length > segment_size)
765 length = segment_size - offset;
766
767 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700768}
769
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700770static int rbd_get_num_segments(struct rbd_image_header *header,
771 u64 ofs, u64 len)
772{
Alex Elderdf111be2012-08-09 10:33:26 -0700773 u64 start_seg;
774 u64 end_seg;
775
776 if (!len)
777 return 0;
778 if (len - 1 > U64_MAX - ofs)
779 return -ERANGE;
780
781 start_seg = ofs >> header->obj_order;
782 end_seg = (ofs + len - 1) >> header->obj_order;
783
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700784 return end_seg - start_seg + 1;
785}
786
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700787/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700788 * returns the size of an object in the image
789 */
790static u64 rbd_obj_bytes(struct rbd_image_header *header)
791{
792 return 1 << header->obj_order;
793}
794
795/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700796 * bio helpers
797 */
798
799static void bio_chain_put(struct bio *chain)
800{
801 struct bio *tmp;
802
803 while (chain) {
804 tmp = chain;
805 chain = chain->bi_next;
806 bio_put(tmp);
807 }
808}
809
/*
 * zeros a bio chain, starting at specific offset
 *
 * Walks every segment of every bio in the chain, tracking the running
 * byte position; once past start_ofs, the remainder of each segment
 * is cleared.  Segment memory may be in highmem, so each page is
 * mapped with the irq-safe kmap helpers for the duration of the
 * memset.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* byte offset of current segment within the chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* zero only the part at/after start_ofs */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
836
/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 *
 * Returns a new bio covering exactly [offset, offset + len) of
 * bio_src, or NULL on error (bad arguments or allocation failure).
 * The clone shares the source's pages; only the bio_vec table is
 * copied, with the first and last entries trimmed to the range.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;		/* offset into the first cloned bvec */
	unsigned short end_idx;
	unsigned short vcnt;
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	/* resid now holds the in-use length of the final segment */
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
		vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		/* single segment: both trims collapse into one length */
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700917
Alex Elderf7760da2012-10-20 22:17:27 -0500918/*
919 * Clone a portion of a bio chain, starting at the given byte offset
920 * into the first bio in the source chain and continuing for the
921 * number of bytes indicated. The result is another bio chain of
922 * exactly the given length, or a null pointer on error.
923 *
924 * The bio_src and offset parameters are both in-out. On entry they
925 * refer to the first source bio and the offset into that bio where
926 * the start of data to be cloned is located.
927 *
928 * On return, bio_src is updated to refer to the bio in the source
929 * chain that contains first un-cloned byte, and *offset will
930 * contain the offset of that byte within that bio.
931 */
932static struct bio *bio_chain_clone_range(struct bio **bio_src,
933 unsigned int *offset,
934 unsigned int len,
935 gfp_t gfpmask)
936{
937 struct bio *bi = *bio_src;
938 unsigned int off = *offset;
939 struct bio *chain = NULL;
940 struct bio **end;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700941
Alex Elderf7760da2012-10-20 22:17:27 -0500942 /* Build up a chain of clone bios up to the limit */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700943
Alex Elderf7760da2012-10-20 22:17:27 -0500944 if (!bi || off >= bi->bi_size || !len)
945 return NULL; /* Nothing to clone */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700946
Alex Elderf7760da2012-10-20 22:17:27 -0500947 end = &chain;
948 while (len) {
949 unsigned int bi_size;
950 struct bio *bio;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700951
Alex Elderf7760da2012-10-20 22:17:27 -0500952 if (!bi)
953 goto out_err; /* EINVAL; ran out of bio's */
954 bi_size = min_t(unsigned int, bi->bi_size - off, len);
955 bio = bio_clone_range(bi, off, bi_size, gfpmask);
956 if (!bio)
957 goto out_err; /* ENOMEM */
958
959 *end = bio;
960 end = &bio->bi_next;
961
962 off += bi_size;
963 if (off == bi->bi_size) {
964 bi = bi->bi_next;
965 off = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700966 }
Alex Elderf7760da2012-10-20 22:17:27 -0500967 len -= bi_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700968 }
Alex Elderf7760da2012-10-20 22:17:27 -0500969 *bio_src = bi;
970 *offset = off;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700971
Alex Elderf7760da2012-10-20 22:17:27 -0500972 return chain;
973out_err:
974 bio_chain_put(chain);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700975
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700976 return NULL;
977}
978
979/*
980 * helpers for osd request op vectors.
981 */
Alex Elder57cfc102012-06-26 12:57:03 -0700982static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
983 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700984{
Alex Elder57cfc102012-06-26 12:57:03 -0700985 struct ceph_osd_req_op *ops;
986
987 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
988 if (!ops)
989 return NULL;
990
991 ops[0].op = opcode;
992
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700993 /*
994 * op extent offset and length will be set later on
995 * in calc_raw_layout()
996 */
Alex Elder57cfc102012-06-26 12:57:03 -0700997 ops[0].payload_len = payload_len;
998
999 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001000}
1001
/* Free an op vector allocated by rbd_create_rw_ops() (NULL is OK). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
1006
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001007static void rbd_coll_end_req_index(struct request *rq,
1008 struct rbd_req_coll *coll,
1009 int index,
1010 int ret, u64 len)
1011{
1012 struct request_queue *q;
1013 int min, max, i;
1014
Alex Elderbd919d42012-07-13 20:35:11 -05001015 dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
1016 coll, index, ret, (unsigned long long) len);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001017
1018 if (!rq)
1019 return;
1020
1021 if (!coll) {
1022 blk_end_request(rq, ret, len);
1023 return;
1024 }
1025
1026 q = rq->q;
1027
1028 spin_lock_irq(q->queue_lock);
1029 coll->status[index].done = 1;
1030 coll->status[index].rc = ret;
1031 coll->status[index].bytes = len;
1032 max = min = coll->num_done;
1033 while (max < coll->total && coll->status[max].done)
1034 max++;
1035
1036 for (i = min; i<max; i++) {
1037 __blk_end_request(rq, coll->status[i].rc,
1038 coll->status[i].bytes);
1039 coll->num_done++;
1040 kref_put(&coll->kref, rbd_coll_release);
1041 }
1042 spin_unlock_irq(q->queue_lock);
1043}
1044
1045static void rbd_coll_end_req(struct rbd_request *req,
1046 int ret, u64 len)
1047{
1048 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
1049}
1050
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001051/*
1052 * Send ceph osd request
1053 */
1054static int rbd_do_request(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -05001055 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001056 struct ceph_snap_context *snapc,
1057 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001058 const char *object_name, u64 ofs, u64 len,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001059 struct bio *bio,
1060 struct page **pages,
1061 int num_pages,
1062 int flags,
1063 struct ceph_osd_req_op *ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001064 struct rbd_req_coll *coll,
1065 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001066 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001067 struct ceph_msg *msg),
1068 struct ceph_osd_request **linger_req,
1069 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001070{
1071 struct ceph_osd_request *req;
1072 struct ceph_file_layout *layout;
1073 int ret;
1074 u64 bno;
1075 struct timespec mtime = CURRENT_TIME;
1076 struct rbd_request *req_data;
1077 struct ceph_osd_request_head *reqhead;
Alex Elder1dbb4392012-01-24 10:08:37 -06001078 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001079
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001080 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001081 if (!req_data) {
1082 if (coll)
1083 rbd_coll_end_req_index(rq, coll, coll_index,
1084 -ENOMEM, len);
1085 return -ENOMEM;
1086 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001087
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001088 if (coll) {
1089 req_data->coll = coll;
1090 req_data->coll_index = coll_index;
1091 }
1092
Alex Elderf7760da2012-10-20 22:17:27 -05001093 dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
1094 object_name, (unsigned long long) ofs,
1095 (unsigned long long) len, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001096
Alex Elder0ce1a792012-07-03 16:01:18 -05001097 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder1dbb4392012-01-24 10:08:37 -06001098 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
1099 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -07001100 if (!req) {
Sage Weil4ad12622011-05-03 09:23:36 -07001101 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001102 goto done_pages;
1103 }
1104
1105 req->r_callback = rbd_cb;
1106
1107 req_data->rq = rq;
1108 req_data->bio = bio;
1109 req_data->pages = pages;
1110 req_data->len = len;
1111
1112 req->r_priv = req_data;
1113
1114 reqhead = req->r_request->front.iov_base;
1115 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
1116
Alex Elderaded07e2012-07-03 16:01:18 -05001117 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001118 req->r_oid_len = strlen(req->r_oid);
1119
1120 layout = &req->r_file_layout;
1121 memset(layout, 0, sizeof(*layout));
1122 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1123 layout->fl_stripe_count = cpu_to_le32(1);
1124 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001125 layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id);
Sage Weil6cae3712012-09-24 21:02:47 -07001126 ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
1127 req, ops);
1128 rbd_assert(ret == 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001129
1130 ceph_osdc_build_request(req, ofs, &len,
1131 ops,
1132 snapc,
1133 &mtime,
1134 req->r_oid, req->r_oid_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001135
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001136 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -06001137 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001138 *linger_req = req;
1139 }
1140
Alex Elder1dbb4392012-01-24 10:08:37 -06001141 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001142 if (ret < 0)
1143 goto done_err;
1144
1145 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -06001146 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001147 if (ver)
1148 *ver = le64_to_cpu(req->r_reassert_version.version);
Alex Elderbd919d42012-07-13 20:35:11 -05001149 dout("reassert_ver=%llu\n",
1150 (unsigned long long)
1151 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001152 ceph_osdc_put_request(req);
1153 }
1154 return ret;
1155
1156done_err:
1157 bio_chain_put(req_data->bio);
1158 ceph_osdc_put_request(req);
1159done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001160 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001161 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001162 return ret;
1163}
1164
/*
 * Ceph osd op callback
 *
 * Completion handler for asynchronous requests submitted by
 * rbd_do_request().  Decodes the reply, patches up short/absent
 * reads by zero-filling the bio chain, reports status through the
 * collection, and releases the request and its private data.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);	/* first op follows the header */
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
		(unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* object doesn't exist: a read of it is all zeroes */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero the tail and report the full length */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1204
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001205static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1206{
1207 ceph_osdc_put_request(req);
1208}
1209
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001210/*
1211 * Do a synchronous ceph osd operation
1212 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001213static int rbd_req_sync_op(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001214 struct ceph_snap_context *snapc,
1215 u64 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001216 int flags,
Alex Elder913d2fd2012-06-26 12:57:03 -07001217 struct ceph_osd_req_op *ops,
Alex Elderaded07e2012-07-03 16:01:18 -05001218 const char *object_name,
Alex Elderf8d4de62012-07-03 16:01:19 -05001219 u64 ofs, u64 inbound_size,
1220 char *inbound,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001221 struct ceph_osd_request **linger_req,
1222 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001223{
1224 int ret;
1225 struct page **pages;
1226 int num_pages;
Alex Elder913d2fd2012-06-26 12:57:03 -07001227
Alex Elderaafb230e2012-09-06 16:00:54 -05001228 rbd_assert(ops != NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001229
Alex Elderf8d4de62012-07-03 16:01:19 -05001230 num_pages = calc_pages_for(ofs, inbound_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001231 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
Dan Carpenterb8d06382010-10-11 21:14:23 +02001232 if (IS_ERR(pages))
1233 return PTR_ERR(pages);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001234
Alex Elder0ce1a792012-07-03 16:01:18 -05001235 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
Alex Elderf8d4de62012-07-03 16:01:19 -05001236 object_name, ofs, inbound_size, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001237 pages, num_pages,
1238 flags,
1239 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001240 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001241 NULL,
1242 linger_req, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001243 if (ret < 0)
Alex Elder913d2fd2012-06-26 12:57:03 -07001244 goto done;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001245
Alex Elderf8d4de62012-07-03 16:01:19 -05001246 if ((flags & CEPH_OSD_FLAG_READ) && inbound)
1247 ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001248
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001249done:
1250 ceph_release_page_vector(pages, num_pages);
1251 return ret;
1252}
1253
/*
 * Do an asynchronous ceph osd operation
 *
 * Issue one read or write (chosen from the block request's data
 * direction) covering a single rbd segment.  The caller guarantees
 * the [ofs, ofs+len) range does not cross a segment boundary.
 * Completion is reported through the collection via rbd_req_cb().
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;
	int opcode;
	int flags;
	u64 snapid;

	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	if (rq_data_dir(rq) == WRITE) {
		opcode = CEPH_OSD_OP_WRITE;
		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
		snapid = CEPH_NOSNAP;	/* writes always go to the head */
		payload_len = seg_len;
	} else {
		opcode = CEPH_OSD_OP_READ;
		flags = CEPH_OSD_FLAG_READ;
		snapc = NULL;		/* reads carry a snap id, not a context */
		snapid = rbd_dev->spec->snap_id;
		payload_len = 0;
	}

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1318
1319/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001320 * Request sync osd read
1321 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001322static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001323 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001324 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001325 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001326 char *buf,
1327 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001328{
Alex Elder913d2fd2012-06-26 12:57:03 -07001329 struct ceph_osd_req_op *ops;
1330 int ret;
1331
1332 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1333 if (!ops)
1334 return -ENOMEM;
1335
1336 ret = rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001337 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001338 CEPH_OSD_FLAG_READ,
Alex Elder913d2fd2012-06-26 12:57:03 -07001339 ops, object_name, ofs, len, buf, NULL, ver);
1340 rbd_destroy_ops(ops);
1341
1342 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001343}
1344
/*
 * Acknowledge a watch notification back to the osd.
 *
 * Sent (fire-and-forget via rbd_simple_req_cb) from rbd_watch_cb()
 * so the osd knows this client processed the notification.
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = cpu_to_le64(ver);
	/*
	 * NOTE(review): cookie is assigned without cpu_to_le64() while
	 * ver above is converted — verify notify_id is already in the
	 * wire byte order expected by the osd client.
	 */
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  rbd_dev->header_name, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  ops,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1374
/*
 * Watch event callback: the header object changed (e.g. resize or
 * snapshot).  Refresh the device's view of the image header, then
 * acknowledge the notification so the osd stops waiting on us.
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	u64 hver;	/* header version reported by the refresh */
	int rc;

	if (!rbd_dev)
		return;

	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
		rbd_dev->header_name, (unsigned long long) notify_id,
		(unsigned int) opcode);
	rc = rbd_dev_refresh(rbd_dev, &hver);
	if (rc)
		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
			   " update snaps: %d\n", rbd_dev->major, rc);

	/* ack even if the refresh failed, so we keep getting events */
	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
}
1394
/*
 * Request sync osd watch
 *
 * Register a watch on the image header object: create the osd client
 * event (dispatching to rbd_watch_cb), then send a lingering WATCH op
 * recorded in rbd_dev->watch_request.  On failure the event is torn
 * down before returning.
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 == establish the watch */

	/* lingering op: the osd client keeps it alive across reconnects */
	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1438
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001439/*
1440 * Request sync osd unwatch
1441 */
Alex Elder070c6332012-07-25 09:32:41 -05001442static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001443{
1444 struct ceph_osd_req_op *ops;
Alex Elder57cfc102012-06-26 12:57:03 -07001445 int ret;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001446
Alex Elder57cfc102012-06-26 12:57:03 -07001447 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1448 if (!ops)
1449 return -ENOMEM;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001450
1451 ops[0].watch.ver = 0;
Alex Elder0ce1a792012-07-03 16:01:18 -05001452 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001453 ops[0].watch.flag = 0;
1454
Alex Elder0ce1a792012-07-03 16:01:18 -05001455 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001456 CEPH_NOSNAP,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001457 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1458 ops,
Alex Elder070c6332012-07-25 09:32:41 -05001459 rbd_dev->header_name,
1460 0, 0, NULL, NULL, NULL);
1461
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001462
1463 rbd_destroy_ops(ops);
Alex Elder0ce1a792012-07-03 16:01:18 -05001464 ceph_osdc_cancel_event(rbd_dev->watch_event);
1465 rbd_dev->watch_event = NULL;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001466 return ret;
1467}
1468
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001469/*
Alex Elder3cb4a682012-06-26 12:57:03 -07001470 * Synchronous osd object method call
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001471 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001472static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001473 const char *object_name,
1474 const char *class_name,
1475 const char *method_name,
Alex Elder3cb4a682012-06-26 12:57:03 -07001476 const char *outbound,
1477 size_t outbound_size,
Alex Elderf8d4de62012-07-03 16:01:19 -05001478 char *inbound,
1479 size_t inbound_size,
Alex Elder3cb4a682012-06-26 12:57:03 -07001480 int flags,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001481 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001482{
1483 struct ceph_osd_req_op *ops;
Alex Elderaded07e2012-07-03 16:01:18 -05001484 int class_name_len = strlen(class_name);
1485 int method_name_len = strlen(method_name);
Alex Elder3cb4a682012-06-26 12:57:03 -07001486 int payload_size;
Alex Elder57cfc102012-06-26 12:57:03 -07001487 int ret;
1488
Alex Elder3cb4a682012-06-26 12:57:03 -07001489 /*
1490 * Any input parameters required by the method we're calling
1491 * will be sent along with the class and method names as
1492 * part of the message payload. That data and its size are
1493 * supplied via the indata and indata_len fields (named from
1494 * the perspective of the server side) in the OSD request
1495 * operation.
1496 */
1497 payload_size = class_name_len + method_name_len + outbound_size;
1498 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
Alex Elder57cfc102012-06-26 12:57:03 -07001499 if (!ops)
1500 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001501
Alex Elderaded07e2012-07-03 16:01:18 -05001502 ops[0].cls.class_name = class_name;
1503 ops[0].cls.class_len = (__u8) class_name_len;
1504 ops[0].cls.method_name = method_name;
1505 ops[0].cls.method_len = (__u8) method_name_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001506 ops[0].cls.argc = 0;
Alex Elder3cb4a682012-06-26 12:57:03 -07001507 ops[0].cls.indata = outbound;
1508 ops[0].cls.indata_len = outbound_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001509
Alex Elder0ce1a792012-07-03 16:01:18 -05001510 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001511 CEPH_NOSNAP,
Alex Elder3cb4a682012-06-26 12:57:03 -07001512 flags, ops,
Alex Elderf8d4de62012-07-03 16:01:19 -05001513 object_name, 0, inbound_size, inbound,
1514 NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001515
1516 rbd_destroy_ops(ops);
1517
1518 dout("cls_exec returned %d\n", ret);
1519 return ret;
1520}
1521
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001522static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1523{
1524 struct rbd_req_coll *coll =
1525 kzalloc(sizeof(struct rbd_req_coll) +
1526 sizeof(struct rbd_req_status) * num_reqs,
1527 GFP_ATOMIC);
1528
1529 if (!coll)
1530 return NULL;
1531 coll->total = num_reqs;
1532 kref_init(&coll->kref);
1533 return coll;
1534}
1535
/*
 * block device queue callback
 *
 * Entered with q->queue_lock held (the block layer's request_fn
 * contract; the body drops and retakes it around the actual I/O
 * submission).  Splits each request into per-object segments and
 * submits one osd request per segment, tracking completion through
 * a shared rbd_req_coll.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;

	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		bool do_write;
		unsigned int size;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;
		unsigned int bio_offset;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);
		if (do_write && rbd_dev->mapping.read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		/* Drop the queue lock while we issue the osd requests */
		spin_unlock_irq(q->queue_lock);

		down_read(&rbd_dev->header_rwsem);

		/* A mapped snapshot may have been deleted underneath us */
		if (!rbd_dev->exists) {
			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
			up_read(&rbd_dev->header_rwsem);
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		/* Hold a snap context reference for the life of the request */
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		bio = rq->bio;

		dout("%s 0x%x bytes at 0x%llx\n",
			do_write ? "write" : "read",
			size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		if (num_segs <= 0) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, num_segs);
			ceph_put_snap_context(snapc);
			continue;
		}
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		bio_offset = 0;
		do {
			/* Largest chunk that stays within one rbd object */
			u64 limit = rbd_segment_length(rbd_dev, ofs, size);
			unsigned int chain_size;
			struct bio *bio_chain;

			BUG_ON(limit > (u64) UINT_MAX);
			chain_size = (unsigned int) limit;
			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);

			/* Each in-flight segment holds a collection ref */
			kref_get(&coll->kref);

			/* Pass a cloned bio chain via an osd request */

			bio_chain = bio_chain_clone_range(&bio,
						&bio_offset, chain_size,
						GFP_ATOMIC);
			if (bio_chain)
				(void) rbd_do_op(rq, rbd_dev, snapc,
						ofs, chain_size,
						bio_chain, coll, cur_seg);
			else
				/* Clone failed; fail just this segment */
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, chain_size);
			size -= chain_size;
			ofs += chain_size;

			cur_seg++;
		} while (size > 0);
		/* Drop the initial reference taken by rbd_alloc_coll() */
		kref_put(&coll->kref, rbd_coll_release);

		/* Retake the lock before fetching the next request */
		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1645
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone_range()
 *
 * Returns the number of bytes of @bvec that may be added to the bio
 * described by @bmd: at most bvec->bv_len, and never extending past
 * the end of the rbd object the bio starts in (except for the
 * mandatory single-page case when the bio is empty).
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	sector_t sector_offset;
	sector_t sectors_per_obj;
	sector_t obj_sector_offset;
	int ret;

	/*
	 * Find how far into its rbd object the partition-relative
	 * bio start sector is to offset relative to the enclosing
	 * device.
	 */
	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
	/* obj_order is log2 of the object size in bytes */
	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	obj_sector_offset = sector_offset & (sectors_per_obj - 1);

	/*
	 * Compute the number of bytes from that offset to the end
	 * of the object.  Account for what's already used by the bio.
	 */
	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
	if (ret > bmd->bi_size)
		ret -= bmd->bi_size;
	else
		ret = 0;

	/*
	 * Don't send back more than was asked for.  And if the bio
	 * was empty, let the whole thing through because:  "Note
	 * that a block device *must* allow a single page to be
	 * added to an empty bio."
	 */
	rbd_assert(bvec->bv_len <= PAGE_SIZE);
	if (ret > (int) bvec->bv_len || !bmd->bi_size)
		ret = (int) bvec->bv_len;

	return ret;
}
1691
1692static void rbd_free_disk(struct rbd_device *rbd_dev)
1693{
1694 struct gendisk *disk = rbd_dev->disk;
1695
1696 if (!disk)
1697 return;
1698
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001699 if (disk->flags & GENHD_FL_UP)
1700 del_gendisk(disk);
1701 if (disk->queue)
1702 blk_cleanup_queue(disk->queue);
1703 put_disk(disk);
1704}
1705
1706/*
Alex Elder4156d992012-08-02 11:29:46 -05001707 * Read the complete header for the given rbd device.
1708 *
1709 * Returns a pointer to a dynamically-allocated buffer containing
1710 * the complete and validated header. Caller can pass the address
1711 * of a variable that will be filled in with the version of the
1712 * header object at the time it was read.
1713 *
1714 * Returns a pointer-coded errno if a failure occurs.
1715 */
1716static struct rbd_image_header_ondisk *
1717rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
1718{
1719 struct rbd_image_header_ondisk *ondisk = NULL;
1720 u32 snap_count = 0;
1721 u64 names_size = 0;
1722 u32 want_count;
1723 int ret;
1724
1725 /*
1726 * The complete header will include an array of its 64-bit
1727 * snapshot ids, followed by the names of those snapshots as
1728 * a contiguous block of NUL-terminated strings. Note that
1729 * the number of snapshots could change by the time we read
1730 * it in, in which case we re-read it.
1731 */
1732 do {
1733 size_t size;
1734
1735 kfree(ondisk);
1736
1737 size = sizeof (*ondisk);
1738 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1739 size += names_size;
1740 ondisk = kmalloc(size, GFP_KERNEL);
1741 if (!ondisk)
1742 return ERR_PTR(-ENOMEM);
1743
1744 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
1745 rbd_dev->header_name,
1746 0, size,
1747 (char *) ondisk, version);
1748
1749 if (ret < 0)
1750 goto out_err;
1751 if (WARN_ON((size_t) ret < size)) {
1752 ret = -ENXIO;
1753 pr_warning("short header read for image %s"
1754 " (want %zd got %d)\n",
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001755 rbd_dev->spec->image_name, size, ret);
Alex Elder4156d992012-08-02 11:29:46 -05001756 goto out_err;
1757 }
1758 if (!rbd_dev_ondisk_valid(ondisk)) {
1759 ret = -ENXIO;
1760 pr_warning("invalid header for image %s\n",
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001761 rbd_dev->spec->image_name);
Alex Elder4156d992012-08-02 11:29:46 -05001762 goto out_err;
1763 }
1764
1765 names_size = le64_to_cpu(ondisk->snap_names_len);
1766 want_count = snap_count;
1767 snap_count = le32_to_cpu(ondisk->snap_count);
1768 } while (snap_count != want_count);
1769
1770 return ondisk;
1771
1772out_err:
1773 kfree(ondisk);
1774
1775 return ERR_PTR(ret);
1776}
1777
1778/*
1779 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001780 */
1781static int rbd_read_header(struct rbd_device *rbd_dev,
1782 struct rbd_image_header *header)
1783{
Alex Elder4156d992012-08-02 11:29:46 -05001784 struct rbd_image_header_ondisk *ondisk;
1785 u64 ver = 0;
1786 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001787
Alex Elder4156d992012-08-02 11:29:46 -05001788 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1789 if (IS_ERR(ondisk))
1790 return PTR_ERR(ondisk);
1791 ret = rbd_header_from_disk(header, ondisk);
1792 if (ret >= 0)
1793 header->obj_version = ver;
1794 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001795
Alex Elder4156d992012-08-02 11:29:46 -05001796 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001797}
1798
Alex Elder41f38c22012-10-25 23:34:40 -05001799static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001800{
1801 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001802 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001803
Alex Eldera0593292012-07-19 09:09:27 -05001804 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder41f38c22012-10-25 23:34:40 -05001805 rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001806}
1807
Alex Elder94785542012-10-09 13:50:17 -07001808static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1809{
1810 sector_t size;
1811
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001812 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Alex Elder94785542012-10-09 13:50:17 -07001813 return;
1814
1815 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1816 dout("setting size to %llu sectors", (unsigned long long) size);
1817 rbd_dev->mapping.size = (u64) size;
1818 set_capacity(rbd_dev->disk, size);
1819}
1820
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001821/*
1822 * only read the first part of the ondisk header, without the snaps info
1823 */
Alex Elder117973f2012-08-31 17:29:55 -05001824static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001825{
1826 int ret;
1827 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001828
1829 ret = rbd_read_header(rbd_dev, &h);
1830 if (ret < 0)
1831 return ret;
1832
Josh Durgina51aa0c2011-12-05 10:35:04 -08001833 down_write(&rbd_dev->header_rwsem);
1834
Alex Elder94785542012-10-09 13:50:17 -07001835 /* Update image size, and check for resize of mapped image */
1836 rbd_dev->header.image_size = h.image_size;
1837 rbd_update_mapping_size(rbd_dev);
Sage Weil9db4b3e2011-04-19 22:49:06 -07001838
Alex Elder849b4262012-07-09 21:04:24 -05001839 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001840 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05001841 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08001842 /* osd requests may still refer to snapc */
1843 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001844
Alex Elderb8136232012-07-25 09:32:41 -05001845 if (hver)
1846 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08001847 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08001848 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001849 rbd_dev->header.snapc = h.snapc;
1850 rbd_dev->header.snap_names = h.snap_names;
1851 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05001852 /* Free the extra copy of the object prefix */
1853 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1854 kfree(h.object_prefix);
1855
Alex Elder304f6802012-08-31 17:29:52 -05001856 ret = rbd_dev_snaps_update(rbd_dev);
1857 if (!ret)
1858 ret = rbd_dev_snaps_register(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001859
Josh Durginc6666012011-11-21 17:11:12 -08001860 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001861
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001862 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001863}
1864
Alex Elder117973f2012-08-31 17:29:55 -05001865static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
Alex Elder1fe5e992012-07-25 09:32:41 -05001866{
1867 int ret;
1868
Alex Elder117973f2012-08-31 17:29:55 -05001869 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder1fe5e992012-07-25 09:32:41 -05001870 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05001871 if (rbd_dev->image_format == 1)
1872 ret = rbd_dev_v1_refresh(rbd_dev, hver);
1873 else
1874 ret = rbd_dev_v2_refresh(rbd_dev, hver);
Alex Elder1fe5e992012-07-25 09:32:41 -05001875 mutex_unlock(&ctl_mutex);
1876
1877 return ret;
1878}
1879
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001880static int rbd_init_disk(struct rbd_device *rbd_dev)
1881{
1882 struct gendisk *disk;
1883 struct request_queue *q;
Alex Elder593a9e72012-02-07 12:03:37 -06001884 u64 segment_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001885
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001886 /* create gendisk info */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001887 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1888 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05001889 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001890
Alex Elderf0f8cef2012-01-29 13:57:44 -06001891 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05001892 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001893 disk->major = rbd_dev->major;
1894 disk->first_minor = 0;
1895 disk->fops = &rbd_bd_ops;
1896 disk->private_data = rbd_dev;
1897
1898 /* init rq */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001899 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1900 if (!q)
1901 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07001902
Alex Elder593a9e72012-02-07 12:03:37 -06001903 /* We use the default size, but let's be explicit about it. */
1904 blk_queue_physical_block_size(q, SECTOR_SIZE);
1905
Josh Durgin029bcbd2011-07-22 11:35:23 -07001906 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06001907 segment_size = rbd_obj_bytes(&rbd_dev->header);
1908 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1909 blk_queue_max_segment_size(q, segment_size);
1910 blk_queue_io_min(q, segment_size);
1911 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07001912
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001913 blk_queue_merge_bvec(q, rbd_merge_bvec);
1914 disk->queue = q;
1915
1916 q->queuedata = rbd_dev;
1917
1918 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001919
Alex Elder12f02942012-08-29 17:11:07 -05001920 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
1921
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001922 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001923out_disk:
1924 put_disk(disk);
Alex Elder1fcdb8a2012-08-29 17:11:06 -05001925
1926 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001927}
1928
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001929/*
1930 sysfs
1931*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001932
Alex Elder593a9e72012-02-07 12:03:37 -06001933static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1934{
1935 return container_of(dev, struct rbd_device, dev);
1936}
1937
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001938static ssize_t rbd_size_show(struct device *dev,
1939 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001940{
Alex Elder593a9e72012-02-07 12:03:37 -06001941 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001942 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001943
Josh Durgina51aa0c2011-12-05 10:35:04 -08001944 down_read(&rbd_dev->header_rwsem);
1945 size = get_capacity(rbd_dev->disk);
1946 up_read(&rbd_dev->header_rwsem);
1947
1948 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001949}
1950
Alex Elder34b13182012-07-13 20:35:12 -05001951/*
1952 * Note this shows the features for whatever's mapped, which is not
1953 * necessarily the base image.
1954 */
1955static ssize_t rbd_features_show(struct device *dev,
1956 struct device_attribute *attr, char *buf)
1957{
1958 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1959
1960 return sprintf(buf, "0x%016llx\n",
1961 (unsigned long long) rbd_dev->mapping.features);
1962}
1963
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001964static ssize_t rbd_major_show(struct device *dev,
1965 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001966{
Alex Elder593a9e72012-02-07 12:03:37 -06001967 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001968
1969 return sprintf(buf, "%d\n", rbd_dev->major);
1970}
1971
1972static ssize_t rbd_client_id_show(struct device *dev,
1973 struct device_attribute *attr, char *buf)
1974{
Alex Elder593a9e72012-02-07 12:03:37 -06001975 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001976
Alex Elder1dbb4392012-01-24 10:08:37 -06001977 return sprintf(buf, "client%lld\n",
1978 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001979}
1980
1981static ssize_t rbd_pool_show(struct device *dev,
1982 struct device_attribute *attr, char *buf)
1983{
Alex Elder593a9e72012-02-07 12:03:37 -06001984 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001985
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001986 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001987}
1988
Alex Elder9bb2f332012-07-12 10:46:35 -05001989static ssize_t rbd_pool_id_show(struct device *dev,
1990 struct device_attribute *attr, char *buf)
1991{
1992 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1993
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001994 return sprintf(buf, "%llu\n",
1995 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05001996}
1997
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001998static ssize_t rbd_name_show(struct device *dev,
1999 struct device_attribute *attr, char *buf)
2000{
Alex Elder593a9e72012-02-07 12:03:37 -06002001 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002002
Alex Eldera92ffdf2012-10-30 19:40:33 -05002003 if (rbd_dev->spec->image_name)
2004 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2005
2006 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002007}
2008
Alex Elder589d30e2012-07-10 20:30:11 -05002009static ssize_t rbd_image_id_show(struct device *dev,
2010 struct device_attribute *attr, char *buf)
2011{
2012 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2013
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002014 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05002015}
2016
Alex Elder34b13182012-07-13 20:35:12 -05002017/*
2018 * Shows the name of the currently-mapped snapshot (or
2019 * RBD_SNAP_HEAD_NAME for the base image).
2020 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002021static ssize_t rbd_snap_show(struct device *dev,
2022 struct device_attribute *attr,
2023 char *buf)
2024{
Alex Elder593a9e72012-02-07 12:03:37 -06002025 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002026
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002027 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002028}
2029
Alex Elder86b00e02012-10-25 23:34:42 -05002030/*
2031 * For an rbd v2 image, shows the pool id, image id, and snapshot id
2032 * for the parent image. If there is no parent, simply shows
2033 * "(no parent image)".
2034 */
2035static ssize_t rbd_parent_show(struct device *dev,
2036 struct device_attribute *attr,
2037 char *buf)
2038{
2039 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2040 struct rbd_spec *spec = rbd_dev->parent_spec;
2041 int count;
2042 char *bufp = buf;
2043
2044 if (!spec)
2045 return sprintf(buf, "(no parent image)\n");
2046
2047 count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
2048 (unsigned long long) spec->pool_id, spec->pool_name);
2049 if (count < 0)
2050 return count;
2051 bufp += count;
2052
2053 count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
2054 spec->image_name ? spec->image_name : "(unknown)");
2055 if (count < 0)
2056 return count;
2057 bufp += count;
2058
2059 count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
2060 (unsigned long long) spec->snap_id, spec->snap_name);
2061 if (count < 0)
2062 return count;
2063 bufp += count;
2064
2065 count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
2066 if (count < 0)
2067 return count;
2068 bufp += count;
2069
2070 return (ssize_t) (bufp - buf);
2071}
2072
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002073static ssize_t rbd_image_refresh(struct device *dev,
2074 struct device_attribute *attr,
2075 const char *buf,
2076 size_t size)
2077{
Alex Elder593a9e72012-02-07 12:03:37 -06002078 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05002079 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002080
Alex Elder117973f2012-08-31 17:29:55 -05002081 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05002082
2083 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002084}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002085
/* Per-device sysfs attributes; all read-only except "refresh". */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002097
/* Attribute group wiring for the rbd device type. */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* Empty release: rbd_device lifetime is managed elsewhere. */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
2131
2132
2133/*
2134 sysfs - snapshots
2135*/
2136
2137static ssize_t rbd_snap_size_show(struct device *dev,
2138 struct device_attribute *attr,
2139 char *buf)
2140{
2141 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2142
Josh Durgin35915382011-12-05 18:25:13 -08002143 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002144}
2145
2146static ssize_t rbd_snap_id_show(struct device *dev,
2147 struct device_attribute *attr,
2148 char *buf)
2149{
2150 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2151
Josh Durgin35915382011-12-05 18:25:13 -08002152 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002153}
2154
Alex Elder34b13182012-07-13 20:35:12 -05002155static ssize_t rbd_snap_features_show(struct device *dev,
2156 struct device_attribute *attr,
2157 char *buf)
2158{
2159 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2160
2161 return sprintf(buf, "0x%016llx\n",
2162 (unsigned long long) snap->features);
2163}
2164
/* Per-snapshot sysfs attributes (all read-only) and device type. */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* Called by the device core on final put; frees the snapshot. */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2196
Alex Elder8b8fb992012-10-26 17:25:24 -05002197static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2198{
2199 kref_get(&spec->kref);
2200
2201 return spec;
2202}
2203
2204static void rbd_spec_free(struct kref *kref);
2205static void rbd_spec_put(struct rbd_spec *spec)
2206{
2207 if (spec)
2208 kref_put(&spec->kref, rbd_spec_free);
2209}
2210
2211static struct rbd_spec *rbd_spec_alloc(void)
2212{
2213 struct rbd_spec *spec;
2214
2215 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2216 if (!spec)
2217 return NULL;
2218 kref_init(&spec->kref);
2219
2220 rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */
2221
2222 return spec;
2223}
2224
2225static void rbd_spec_free(struct kref *kref)
2226{
2227 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2228
2229 kfree(spec->pool_name);
2230 kfree(spec->image_id);
2231 kfree(spec->image_name);
2232 kfree(spec->snap_name);
2233 kfree(spec);
2234}
2235
Alex Elderc53d5892012-10-25 23:34:42 -05002236struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2237 struct rbd_spec *spec)
2238{
2239 struct rbd_device *rbd_dev;
2240
2241 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2242 if (!rbd_dev)
2243 return NULL;
2244
2245 spin_lock_init(&rbd_dev->lock);
2246 INIT_LIST_HEAD(&rbd_dev->node);
2247 INIT_LIST_HEAD(&rbd_dev->snaps);
2248 init_rwsem(&rbd_dev->header_rwsem);
2249
2250 rbd_dev->spec = spec;
2251 rbd_dev->rbd_client = rbdc;
2252
2253 return rbd_dev;
2254}
2255
2256static void rbd_dev_destroy(struct rbd_device *rbd_dev)
2257{
Alex Elder86b00e02012-10-25 23:34:42 -05002258 rbd_spec_put(rbd_dev->parent_spec);
Alex Elderc53d5892012-10-25 23:34:42 -05002259 kfree(rbd_dev->header_name);
2260 rbd_put_client(rbd_dev->rbd_client);
2261 rbd_spec_put(rbd_dev->spec);
2262 kfree(rbd_dev);
2263}
2264
Alex Elder304f6802012-08-31 17:29:52 -05002265static bool rbd_snap_registered(struct rbd_snap *snap)
2266{
2267 bool ret = snap->dev.type == &rbd_snap_device_type;
2268 bool reg = device_is_registered(&snap->dev);
2269
2270 rbd_assert(!ret ^ reg);
2271
2272 return ret;
2273}
2274
Alex Elder41f38c22012-10-25 23:34:40 -05002275static void rbd_remove_snap_dev(struct rbd_snap *snap)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002276{
2277 list_del(&snap->node);
Alex Elder304f6802012-08-31 17:29:52 -05002278 if (device_is_registered(&snap->dev))
2279 device_unregister(&snap->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002280}
2281
Alex Elder14e70852012-07-19 09:09:27 -05002282static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002283 struct device *parent)
2284{
2285 struct device *dev = &snap->dev;
2286 int ret;
2287
2288 dev->type = &rbd_snap_device_type;
2289 dev->parent = parent;
2290 dev->release = rbd_snap_dev_release;
Alex Elderd4b125e2012-07-03 16:01:19 -05002291 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
Alex Elder304f6802012-08-31 17:29:52 -05002292 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2293
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002294 ret = device_register(dev);
2295
2296 return ret;
2297}
2298
Alex Elder4e891e02012-07-10 20:30:10 -05002299static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05002300 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05002301 u64 snap_id, u64 snap_size,
2302 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002303{
Alex Elder4e891e02012-07-10 20:30:10 -05002304 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002305 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002306
2307 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002308 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002309 return ERR_PTR(-ENOMEM);
2310
2311 ret = -ENOMEM;
Alex Elderc8d18422012-07-10 20:30:11 -05002312 snap->name = kstrdup(snap_name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002313 if (!snap->name)
2314 goto err;
2315
Alex Elderc8d18422012-07-10 20:30:11 -05002316 snap->id = snap_id;
2317 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05002318 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05002319
2320 return snap;
2321
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002322err:
2323 kfree(snap->name);
2324 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002325
2326 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002327}
2328
Alex Eldercd892122012-07-03 16:01:19 -05002329static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2330 u64 *snap_size, u64 *snap_features)
2331{
2332 char *snap_name;
2333
2334 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2335
2336 *snap_size = rbd_dev->header.snap_sizes[which];
2337 *snap_features = 0; /* No features for v1 */
2338
2339 /* Skip over names until we find the one we are looking for */
2340
2341 snap_name = rbd_dev->header.snap_names;
2342 while (which--)
2343 snap_name += strlen(snap_name) + 1;
2344
2345 return snap_name;
2346}
2347
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	/* Reply layout of the "get_size" class method: order byte, then size */
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	/* Synchronous call to the "rbd" class "get_size" method on the OSD */
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Size arrives little-endian on the wire; order is a single byte */
	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2381
2382static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2383{
2384 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2385 &rbd_dev->header.obj_order,
2386 &rbd_dev->header.image_size);
2387}
2388
/*
 * Fetch a format 2 image's object name prefix and record it in
 * rbd_dev->header.object_prefix (a newly-allocated string owned by
 * the header).  Returns 0 on success or a negative errno.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	/* Decode the length-prefixed string reply into a fresh allocation */
	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
2426
/*
 * Fetch the feature bits for the given snapshot (or for the base
 * image if snap_id is CEPH_NOSNAP).  Fails with -ENXIO if the image
 * has "incompat" feature bits this client does not support.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	/* Reply layout of the "get_features" class method */
	struct {
		__le64 features;
		__le64 incompat;
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Refuse to map an image whose required features we don't implement */
	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_ALL)
		return -ENXIO;

	*snap_features = le64_to_cpu(features_buf.features);

	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2460
2461static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2462{
2463 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2464 &rbd_dev->header.features);
2465}
2466
/*
 * Fetch and record parent image information (pool id, image id,
 * snapshot id, and overlap) for a format 2 image.  On success any
 * parent spec becomes owned by rbd_dev.  An image with no parent
 * (pool id CEPH_NOPOOL) is not an error.
 */
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	size_t size;
	void *reply_buf = NULL;
	__le64 snapid;
	void *p;
	void *end;
	char *image_id;
	u64 overlap;
	size_t len = 0;
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	/* Worst-case encoded size of the "get_parent" reply */
	size = sizeof (__le64) +				/* pool_id */
		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
		sizeof (__le64) +				/* snap_id */
		sizeof (__le64);				/* overlap */
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf) {
		ret = -ENOMEM;
		goto out_err;
	}

	snapid = cpu_to_le64(CEPH_NOSNAP);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_parent",
				(char *) &snapid, sizeof (snapid),
				(char *) reply_buf, size,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out_err;

	/* Decode pool id; the *_safe macros jump to out_err on a short buffer */
	ret = -ERANGE;
	p = reply_buf;
	end = (char *) reply_buf + size;
	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
	if (parent_spec->pool_id == CEPH_NOPOOL)
		goto out;	/* No parent?  No problem. */

	image_id = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_id)) {
		ret = PTR_ERR(image_id);
		goto out_err;
	}
	parent_spec->image_id = image_id;
	parent_spec->image_id_len = len;
	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
	ceph_decode_64_safe(&p, end, overlap, out_err);

	rbd_dev->parent_overlap = overlap;
	rbd_dev->parent_spec = parent_spec;
	parent_spec = NULL;	/* rbd_dev now owns this */
out:
	ret = 0;
out_err:
	kfree(reply_buf);
	rbd_spec_put(parent_spec);	/* no-op once ownership transferred */

	return ret;
}
2532
/*
 * Look up an image's name given its id, using the "dir_get_name"
 * method on the pool's rbd directory object.  Returns a
 * newly-allocated name string (caller frees) or NULL on any
 * failure -- callers treat the name as best-effort.
 */
static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	rbd_assert(!rbd_dev->spec->image_name);

	/* Build the request: a length-prefixed copy of the image id */
	image_id_size = sizeof (__le32) + rbd_dev->spec->image_id_len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = (char *) image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id,
				(u32) rbd_dev->spec->image_id_len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
				"rbd", "dir_get_name",
				image_id, image_id_size,
				(char *) reply_buf, size,
				CEPH_OSD_FLAG_READ, NULL);
	if (ret < 0)
		goto out;
	/* Reply is a length-prefixed string; extract an allocated copy */
	p = reply_buf;
	end = (char *) reply_buf + size;
	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;	/* failure is tolerated; return NULL */
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}
2582
2583/*
2584 * When a parent image gets probed, we only have the pool, image,
2585 * and snapshot ids but not the names of any of them. This call
2586 * is made later to fill in those names. It has to be done after
2587 * rbd_dev_snaps_update() has completed because some of the
2588 * information (in particular, snapshot name) is not available
2589 * until then.
2590 */
2591static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
2592{
2593 struct ceph_osd_client *osdc;
2594 const char *name;
2595 void *reply_buf = NULL;
2596 int ret;
2597
2598 if (rbd_dev->spec->pool_name)
2599 return 0; /* Already have the names */
2600
2601 /* Look up the pool name */
2602
2603 osdc = &rbd_dev->rbd_client->client->osdc;
2604 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
2605 if (!name)
2606 return -EIO; /* pool id too large (>= 2^31) */
2607
2608 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
2609 if (!rbd_dev->spec->pool_name)
2610 return -ENOMEM;
2611
2612 /* Fetch the image name; tolerate failure here */
2613
2614 name = rbd_dev_image_name(rbd_dev);
2615 if (name) {
2616 rbd_dev->spec->image_name_len = strlen(name);
2617 rbd_dev->spec->image_name = (char *) name;
2618 } else {
2619 pr_warning(RBD_DRV_NAME "%d "
2620 "unable to get image name for image id %s\n",
2621 rbd_dev->major, rbd_dev->spec->image_id);
2622 }
2623
2624 /* Look up the snapshot name. */
2625
2626 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
2627 if (!name) {
2628 ret = -EIO;
2629 goto out_err;
2630 }
2631 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
2632 if(!rbd_dev->spec->snap_name)
2633 goto out_err;
2634
2635 return 0;
2636out_err:
2637 kfree(reply_buf);
2638 kfree(rbd_dev->spec->pool_name);
2639 rbd_dev->spec->pool_name = NULL;
2640
2641 return ret;
2642}
2643
Alex Elder6e14b1a2012-07-03 16:01:19 -05002644static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05002645{
2646 size_t size;
2647 int ret;
2648 void *reply_buf;
2649 void *p;
2650 void *end;
2651 u64 seq;
2652 u32 snap_count;
2653 struct ceph_snap_context *snapc;
2654 u32 i;
2655
2656 /*
2657 * We'll need room for the seq value (maximum snapshot id),
2658 * snapshot count, and array of that many snapshot ids.
2659 * For now we have a fixed upper limit on the number we're
2660 * prepared to receive.
2661 */
2662 size = sizeof (__le64) + sizeof (__le32) +
2663 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2664 reply_buf = kzalloc(size, GFP_KERNEL);
2665 if (!reply_buf)
2666 return -ENOMEM;
2667
2668 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2669 "rbd", "get_snapcontext",
2670 NULL, 0,
2671 reply_buf, size,
Alex Elder6e14b1a2012-07-03 16:01:19 -05002672 CEPH_OSD_FLAG_READ, ver);
Alex Elder35d489f2012-07-03 16:01:19 -05002673 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2674 if (ret < 0)
2675 goto out;
2676
2677 ret = -ERANGE;
2678 p = reply_buf;
2679 end = (char *) reply_buf + size;
2680 ceph_decode_64_safe(&p, end, seq, out);
2681 ceph_decode_32_safe(&p, end, snap_count, out);
2682
2683 /*
2684 * Make sure the reported number of snapshot ids wouldn't go
2685 * beyond the end of our buffer. But before checking that,
2686 * make sure the computed size of the snapshot context we
2687 * allocate is representable in a size_t.
2688 */
2689 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2690 / sizeof (u64)) {
2691 ret = -EINVAL;
2692 goto out;
2693 }
2694 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2695 goto out;
2696
2697 size = sizeof (struct ceph_snap_context) +
2698 snap_count * sizeof (snapc->snaps[0]);
2699 snapc = kmalloc(size, GFP_KERNEL);
2700 if (!snapc) {
2701 ret = -ENOMEM;
2702 goto out;
2703 }
2704
2705 atomic_set(&snapc->nref, 1);
2706 snapc->seq = seq;
2707 snapc->num_snaps = snap_count;
2708 for (i = 0; i < snap_count; i++)
2709 snapc->snaps[i] = ceph_decode_64(&p);
2710
2711 rbd_dev->header.snapc = snapc;
2712
2713 dout(" snap context seq = %llu, snap_count = %u\n",
2714 (unsigned long long) seq, (unsigned int) snap_count);
2715
2716out:
2717 kfree(reply_buf);
2718
2719 return 0;
2720}
2721
/*
 * Fetch the name of the snapshot at position "which" in the image's
 * snapshot context.  Returns a newly-allocated string (caller must
 * free) or an ERR_PTR on failure.
 */
static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	size_t size;
	void *reply_buf;
	__le64 snap_id;
	int ret;
	void *p;
	void *end;
	char *snap_name;

	/* Room for a length-prefixed name of maximum size */
	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_snapshot_name",
				(char *) &snap_id, sizeof (snap_id),
				reply_buf, size,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	end = (char *) reply_buf + size;
	/* ceph_extract_encoded_string() returns a fresh allocation */
	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out;
	} else {
		dout("  snap_id 0x%016llx snap_name = %s\n",
			(unsigned long long) le64_to_cpu(snap_id), snap_name);
	}
	kfree(reply_buf);

	return snap_name;
out:
	kfree(reply_buf);

	return ERR_PTR(ret);
}
2765
2766static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2767 u64 *snap_size, u64 *snap_features)
2768{
2769 __le64 snap_id;
2770 u8 order;
2771 int ret;
2772
2773 snap_id = rbd_dev->header.snapc->snaps[which];
2774 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2775 if (ret)
2776 return ERR_PTR(ret);
2777 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2778 if (ret)
2779 return ERR_PTR(ret);
2780
2781 return rbd_dev_v2_snap_name(rbd_dev, which);
2782}
2783
2784static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2785 u64 *snap_size, u64 *snap_features)
2786{
2787 if (rbd_dev->image_format == 1)
2788 return rbd_dev_v1_snap_info(rbd_dev, which,
2789 snap_size, snap_features);
2790 if (rbd_dev->image_format == 2)
2791 return rbd_dev_v2_snap_info(rbd_dev, which,
2792 snap_size, snap_features);
2793 return ERR_PTR(-EINVAL);
2794}
2795
Alex Elder117973f2012-08-31 17:29:55 -05002796static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2797{
2798 int ret;
2799 __u8 obj_order;
2800
2801 down_write(&rbd_dev->header_rwsem);
2802
2803 /* Grab old order first, to see if it changes */
2804
2805 obj_order = rbd_dev->header.obj_order,
2806 ret = rbd_dev_v2_image_size(rbd_dev);
2807 if (ret)
2808 goto out;
2809 if (rbd_dev->header.obj_order != obj_order) {
2810 ret = -EIO;
2811 goto out;
2812 }
2813 rbd_update_mapping_size(rbd_dev);
2814
2815 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2816 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2817 if (ret)
2818 goto out;
2819 ret = rbd_dev_snaps_update(rbd_dev);
2820 dout("rbd_dev_snaps_update returned %d\n", ret);
2821 if (ret)
2822 goto out;
2823 ret = rbd_dev_snaps_register(rbd_dev);
2824 dout("rbd_dev_snaps_register returned %d\n", ret);
2825out:
2826 up_write(&rbd_dev->header_rwsem);
2827
2828 return ret;
2829}
2830
/*
 * Scan the rbd device's current snapshot list and compare it to the
 * newly-received snapshot context.  Remove any existing snapshots
 * not present in the new snapshot context.  Add a new snapshot for
 * any snaphots in the snapshot context not in the current list.
 * And verify there are no changes to snapshots we already know
 * about.
 *
 * Assumes the snapshots in the snapshot context are sorted by
 * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
 * are also maintained in that order.)
 *
 * NOTE(review): caller rbd_dev_v2_refresh() holds header_rwsem for
 * write around this call; presumably all callers must -- confirm.
 */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const u32 snap_count = snapc->num_snaps;
	struct list_head *head = &rbd_dev->snaps;
	struct list_head *links = head->next;
	u32 index = 0;

	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
	/* Merge walk: index scans the new context, links scans the old list */
	while (index < snap_count || links != head) {
		u64 snap_id;
		struct rbd_snap *snap;
		char *snap_name;
		u64 snap_size = 0;
		u64 snap_features = 0;

		snap_id = index < snap_count ? snapc->snaps[index]
					     : CEPH_NOSNAP;
		snap = links != head ? list_entry(links, struct rbd_snap, node)
				     : NULL;
		rbd_assert(!snap || snap->id != CEPH_NOSNAP);

		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
			struct list_head *next = links->next;

			/* Existing snapshot not in the new snap context */

			/* If the mapped snapshot vanished, mark the mapping gone */
			if (rbd_dev->spec->snap_id == snap->id)
				rbd_dev->exists = false;
			rbd_remove_snap_dev(snap);
			dout("%ssnap id %llu has been removed\n",
				rbd_dev->spec->snap_id == snap->id ?
							"mapped " : "",
				(unsigned long long) snap->id);

			/* Done with this list entry; advance */

			links = next;
			continue;
		}

		/*
		 * NOTE(review): for format 2 images snap_name is a fresh
		 * allocation; on the "already present" path below it is
		 * neither stored nor freed -- looks like a leak, verify.
		 */
		snap_name = rbd_dev_snap_info(rbd_dev, index,
					&snap_size, &snap_features);
		if (IS_ERR(snap_name))
			return PTR_ERR(snap_name);

		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
			(unsigned long long) snap_id);
		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
			struct rbd_snap *new_snap;

			/* We haven't seen this snapshot before */

			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
					snap_id, snap_size, snap_features);
			if (IS_ERR(new_snap)) {
				int err = PTR_ERR(new_snap);

				dout("  failed to add dev, error %d\n", err);

				return err;
			}

			/* New goes before existing, or at end of list */

			dout("  added dev%s\n", snap ? "" : " at end\n");
			if (snap)
				list_add_tail(&new_snap->node, &snap->node);
			else
				list_add_tail(&new_snap->node, head);
		} else {
			/* Already have this one */

			dout("  already present\n");

			/* Known snapshots must not change underneath us */
			rbd_assert(snap->size == snap_size);
			rbd_assert(!strcmp(snap->name, snap_name));
			rbd_assert(snap->features == snap_features);

			/* Done with this list entry; advance */

			links = links->next;
		}

		/* Advance to the next entry in the snapshot context */

		index++;
	}
	dout("%s: done\n", __func__);

	return 0;
}
2935
Alex Elder304f6802012-08-31 17:29:52 -05002936/*
2937 * Scan the list of snapshots and register the devices for any that
2938 * have not already been registered.
2939 */
2940static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2941{
2942 struct rbd_snap *snap;
2943 int ret = 0;
2944
2945 dout("%s called\n", __func__);
Alex Elder86ff77b2012-08-31 17:29:53 -05002946 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2947 return -EIO;
Alex Elder304f6802012-08-31 17:29:52 -05002948
2949 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2950 if (!rbd_snap_registered(snap)) {
2951 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2952 if (ret < 0)
2953 break;
2954 }
2955 }
2956 dout("%s: returning %d\n", __func__, ret);
2957
2958 return ret;
2959}
2960
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002961static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2962{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002963 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05002964 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002965
2966 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002967
Alex Eldercd789ab2012-08-30 00:16:38 -05002968 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002969 dev->bus = &rbd_bus_type;
2970 dev->type = &rbd_device_type;
2971 dev->parent = &rbd_root_dev;
2972 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05002973 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002974 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002975
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002976 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05002977
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002978 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002979}
2980
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002981static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2982{
2983 device_unregister(&rbd_dev->dev);
2984}
2985
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002986static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2987{
2988 int ret, rc;
2989
2990 do {
Alex Elder0e6f3222012-07-25 09:32:40 -05002991 ret = rbd_req_sync_watch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002992 if (ret == -ERANGE) {
Alex Elder117973f2012-08-31 17:29:55 -05002993 rc = rbd_dev_refresh(rbd_dev, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002994 if (rc < 0)
2995 return rc;
2996 }
2997 } while (ret == -ERANGE);
2998
2999 return ret;
3000}
3001
Alex Eldere2839302012-08-29 17:11:06 -05003002static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06003003
3004/*
Alex Elder499afd52012-02-02 08:13:29 -06003005 * Get a unique rbd identifier for the given new rbd_dev, and add
3006 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06003007 */
Alex Eldere2839302012-08-29 17:11:06 -05003008static void rbd_dev_id_get(struct rbd_device *rbd_dev)
Alex Elderb7f23c32012-01-29 13:57:43 -06003009{
Alex Eldere2839302012-08-29 17:11:06 -05003010 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
Alex Elder499afd52012-02-02 08:13:29 -06003011
3012 spin_lock(&rbd_dev_list_lock);
3013 list_add_tail(&rbd_dev->node, &rbd_dev_list);
3014 spin_unlock(&rbd_dev_list_lock);
Alex Eldere2839302012-08-29 17:11:06 -05003015 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3016 (unsigned long long) rbd_dev->dev_id);
Alex Elder1ddbe942012-01-29 13:57:44 -06003017}
Alex Elderb7f23c32012-01-29 13:57:43 -06003018
/*
 * Remove an rbd_dev from the global list, and record that its
 * identifier is no longer in use.
 */
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
{
	struct list_head *tmp;
	int rbd_id = rbd_dev->dev_id;
	int max_id;

	/* Ids are assigned starting at 1 by rbd_dev_id_get() */
	rbd_assert(rbd_id > 0);

	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);

	/*
	 * If the id being "put" is not the current maximum, there
	 * is nothing special we need to do.
	 */
	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
		spin_unlock(&rbd_dev_list_lock);
		return;
	}

	/*
	 * We need to update the current maximum id.  Search the
	 * list to find out what it is.  We're more likely to find
	 * the maximum at the end, so search the list backward.
	 */
	max_id = 0;
	list_for_each_prev(tmp, &rbd_dev_list) {
		/* Note: this declaration shadows the function parameter */
		struct rbd_device *rbd_dev;

		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id > max_id)
			max_id = rbd_dev->dev_id;
	}
	spin_unlock(&rbd_dev_list_lock);

	/*
	 * The max id could have been updated by rbd_dev_id_get(), in
	 * which case it now accurately reflects the new maximum.
	 * Be careful not to overwrite the maximum value in that
	 * case.
	 */
	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
	dout("  max dev id has been reset\n");
}
3069
/*
 * Advance *buf past any leading white space, leaving it at the
 * first non-space character found (possibly the terminating '\0').
 * Returns the length of the token (maximal run of non-space
 * characters) at that position.  *buf must be NUL-terminated.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";
	const char *p = *buf;

	p += strspn(p, spaces);		/* skip leading white space */
	*buf = p;

	return strcspn(p, spaces);	/* length of token starting at p */
}
3088
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit (in which case nothing is
 * copied).
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token -- even when the token buffer was too small for a copy.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
3118
3119/*
Alex Elderea3352f2012-07-09 21:04:23 -05003120 * Finds the next token in *buf, dynamically allocates a buffer big
3121 * enough to hold a copy of it, and copies the token into the new
3122 * buffer. The copy is guaranteed to be terminated with '\0'. Note
3123 * that a duplicate buffer is created even for a zero-length token.
3124 *
3125 * Returns a pointer to the newly-allocated duplicate, or a null
3126 * pointer if memory for the duplicate was not available. If
3127 * the lenp argument is a non-null pointer, the length of the token
3128 * (not including the '\0') is returned in *lenp.
3129 *
3130 * If successful, the *buf pointer will be updated to point beyond
3131 * the end of the found token.
3132 *
3133 * Note: uses GFP_KERNEL for allocation.
3134 */
3135static inline char *dup_token(const char **buf, size_t *lenp)
3136{
3137 char *dup;
3138 size_t len;
3139
3140 len = next_token(buf);
3141 dup = kmalloc(len + 1, GFP_KERNEL);
3142 if (!dup)
3143 return NULL;
3144
3145 memcpy(dup, *buf, len);
3146 *(dup + len) = '\0';
3147 *buf += len;
3148
3149 if (lenp)
3150 *lenp = len;
3151
3152 return dup;
3153}
3154
3155/*
Alex Elder859c31d2012-10-25 23:34:42 -05003156 * Parse the options provided for an "rbd add" (i.e., rbd image
3157 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
3158 * and the data written is passed here via a NUL-terminated buffer.
3159 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05003160 *
Alex Elder859c31d2012-10-25 23:34:42 -05003161 * The information extracted from these options is recorded in
3162 * the other parameters which return dynamically-allocated
3163 * structures:
3164 * ceph_opts
3165 * The address of a pointer that will refer to a ceph options
3166 * structure. Caller must release the returned pointer using
3167 * ceph_destroy_options() when it is no longer needed.
3168 * rbd_opts
3169 * Address of an rbd options pointer. Fully initialized by
3170 * this function; caller must release with kfree().
3171 * spec
3172 * Address of an rbd image specification pointer. Fully
3173 * initialized by this function based on parsed options.
3174 * Caller must release with rbd_spec_put().
3175 *
3176 * The options passed take this form:
3177 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3178 * where:
3179 * <mon_addrs>
3180 * A comma-separated list of one or more monitor addresses.
3181 * A monitor address is an ip address, optionally followed
3182 * by a port number (separated by a colon).
3183 * I.e.: ip1[:port1][,ip2[:port2]...]
3184 * <options>
3185 * A comma-separated list of ceph and/or rbd options.
3186 * <pool_name>
3187 * The name of the rados pool containing the rbd image.
3188 * <image_name>
3189 * The name of the image in that pool to map.
3190 * <snap_id>
3191 * An optional snapshot id. If provided, the mapping will
3192 * present data from the image at the time that snapshot was
3193 * created. The image head is used if no snapshot id is
3194 * provided. Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06003195 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;		/* dup'd <options> token; freed before return */
	const char *mon_addrs;	/* points into buf; NOT NUL-terminated */
	size_t mon_addrs_size;	/* monitor token length + 1 */
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len)
		return -EINVAL;	/* Missing monitor address(es) */
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	/* Default error for the "present but empty token" cases below */
	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options)
		goto out_err;	/* Missing options */

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name)
		goto out_err;	/* Missing pool name */

	spec->image_name = dup_token(&buf, &spec->image_name_len);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name)
		goto out_err;	/* Missing image name */

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	/* Copy the snapshot name and NUL-terminate it ourselves */
	spec->snap_name = kmalloc(len + 1, GFP_KERNEL);
	if (!spec->snap_name)
		goto out_mem;
	memcpy(spec->snap_name, buf, len);
	*(spec->snap_name + len) = '\0';

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	/*
	 * ceph_parse_options() consumes the ceph-level options and
	 * hands anything it doesn't recognize to parse_rbd_opts_token(),
	 * which fills in rbd_opts.  The second/third arguments bound
	 * the (unterminated) monitor address substring.
	 */
	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	/* Success: ownership of all three structures passes to the caller */
	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	/* kfree(NULL)/rbd_spec_put(NULL) are safe no-ops */
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}
3291
Alex Elder589d30e2012-07-10 20:30:11 -05003292/*
3293 * An rbd format 2 image has a unique identifier, distinct from the
3294 * name given to it by the user. Internally, that identifier is
3295 * what's used to specify the names of objects related to the image.
3296 *
3297 * A special "rbd id" object is used to map an rbd image name to its
3298 * id. If that object doesn't exist, then there is no v2 rbd image
3299 * with the supplied name.
3300 *
3301 * This function will record the given rbd_dev's image_id field if
3302 * it can be determined, and in that case will return 0. If any
3303 * errors occur a negative errno will be returned and the rbd_dev's
3304 * image_id field will be unchanged (and should be NULL).
3305 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.
	 */
	if (rbd_dev->spec->image_id)
		return 0;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	/* sizeof () includes the NUL that terminates the result */
	size = sizeof (RBD_ID_PREFIX) + rbd_dev->spec->image_name_len;
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* Invoke the "get_id" method of class "rbd" on the id object */
	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	/* Decode the length-prefixed string into a freshly allocated id */
	p = response;
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						&rbd_dev->spec->image_id_len,
						GFP_NOIO);
	if (IS_ERR(rbd_dev->spec->image_id)) {
		ret = PTR_ERR(rbd_dev->spec->image_id);
		/* Leave image_id NULL on failure, per the contract above */
		rbd_dev->spec->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
3369
Alex Eldera30b71b2012-07-10 20:30:11 -05003370static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3371{
3372 int ret;
3373 size_t size;
3374
3375 /* Version 1 images have no id; empty string is used */
3376
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003377 rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
3378 if (!rbd_dev->spec->image_id)
Alex Eldera30b71b2012-07-10 20:30:11 -05003379 return -ENOMEM;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003380 rbd_dev->spec->image_id_len = 0;
Alex Eldera30b71b2012-07-10 20:30:11 -05003381
3382 /* Record the header object name for this rbd image. */
3383
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003384 size = rbd_dev->spec->image_name_len + sizeof (RBD_SUFFIX);
Alex Eldera30b71b2012-07-10 20:30:11 -05003385 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3386 if (!rbd_dev->header_name) {
3387 ret = -ENOMEM;
3388 goto out_err;
3389 }
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003390 sprintf(rbd_dev->header_name, "%s%s",
3391 rbd_dev->spec->image_name, RBD_SUFFIX);
Alex Eldera30b71b2012-07-10 20:30:11 -05003392
3393 /* Populate rbd image metadata */
3394
3395 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3396 if (ret < 0)
3397 goto out_err;
Alex Elder86b00e02012-10-25 23:34:42 -05003398
3399 /* Version 1 images have no parent (no layering) */
3400
3401 rbd_dev->parent_spec = NULL;
3402 rbd_dev->parent_overlap = 0;
3403
Alex Eldera30b71b2012-07-10 20:30:11 -05003404 rbd_dev->image_format = 1;
3405
3406 dout("discovered version 1 image, header name is %s\n",
3407 rbd_dev->header_name);
3408
3409 return 0;
3410
3411out_err:
3412 kfree(rbd_dev->header_name);
3413 rbd_dev->header_name = NULL;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003414 kfree(rbd_dev->spec->image_id);
3415 rbd_dev->spec->image_id = NULL;
Alex Eldera30b71b2012-07-10 20:30:11 -05003416
3417 return ret;
3418}
3419
/*
 * Probe a format 2 image whose image id has already been recorded
 * in rbd_dev->spec->image_id.  Fetches size/order, object prefix,
 * features, optional parent info, and the snapshot context from the
 * image's header object.  Returns 0 on success or a negative errno,
 * releasing anything allocated here on failure.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->spec->image_id_len;
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	/* Undo everything set up above; drops are no-ops for NULL fields */
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3492
/*
 * Complete device setup after a successful format 1 or 2 probe:
 * refresh snapshots, finish the spec, set the mapping, allocate an
 * id and block major, create the disk, register with sysfs, start
 * the header watch, and finally announce the disk.  Returns 0 on
 * success or a negative errno, unwinding exactly what was done.
 */
static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_probe_update_spec(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	ret = register_blkdev(0, rbd_dev->name);
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;	/* register_blkdev(0, ...) returns the major */

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	ret = rbd_init_watch_dev(rbd_dev);
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;
err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;
err_out_disk:
	/* Unwind in reverse order of the setup steps above */
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}
3574
Alex Eldera30b71b2012-07-10 20:30:11 -05003575/*
3576 * Probe for the existence of the header object for the given rbd
3577 * device. For format 2 images this includes determining the image
3578 * id.
3579 */
3580static int rbd_dev_probe(struct rbd_device *rbd_dev)
3581{
3582 int ret;
3583
3584 /*
3585 * Get the id from the image id object. If it's not a
3586 * format 2 image, we'll get ENOENT back, and we'll assume
3587 * it's a format 1 image.
3588 */
3589 ret = rbd_dev_image_id(rbd_dev);
3590 if (ret)
3591 ret = rbd_dev_v1_probe(rbd_dev);
3592 else
3593 ret = rbd_dev_v2_probe(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05003594 if (ret) {
Alex Eldera30b71b2012-07-10 20:30:11 -05003595 dout("probe failed, returning %d\n", ret);
3596
Alex Elder83a06262012-10-30 15:47:17 -05003597 return ret;
3598 }
3599
3600 ret = rbd_dev_probe_finish(rbd_dev);
3601 if (ret)
3602 rbd_header_free(&rbd_dev->header);
3603
Alex Eldera30b71b2012-07-10 20:30:11 -05003604 return ret;
3605}
3606
/*
 * Handle a write to /sys/bus/rbd/add: parse the mapping request,
 * obtain a ceph client, resolve the pool, create the rbd device and
 * probe/register it.  Returns count on success or a negative errno.
 *
 * Ownership note: as each structure is handed off (ceph_opts to the
 * client, rbdc and spec to the rbd_dev) its local pointer is set to
 * NULL so the shared error path below frees only what is still ours.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	/* Hold a module reference for the lifetime of the mapped device */
	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto err_out_module;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}
	ceph_opts = NULL;	/* rbd_dev client now owns this */

	/* pick the pool */
	osdc = &rbdc->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
	if (rc < 0)
		goto err_out_client;
	spec->pool_id = (u64) rc;

	rbd_dev = rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		goto err_out_client;	/* rc is still -ENOMEM here */
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */

	rbd_dev->mapping.read_only = rbd_opts->read_only;
	kfree(rbd_opts);
	rbd_opts = NULL;	/* done with this */

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_rbd_dev;

	return count;
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_opts);
	rbd_spec_put(spec);
err_out_module:
	module_put(THIS_MODULE);

	dout("Error adding device %s\n", buf);

	return (ssize_t) rc;
}
3672
Alex Elderde71a292012-07-03 16:01:19 -05003673static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003674{
3675 struct list_head *tmp;
3676 struct rbd_device *rbd_dev;
3677
Alex Eldere124a822012-01-29 13:57:44 -06003678 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003679 list_for_each(tmp, &rbd_dev_list) {
3680 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05003681 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06003682 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003683 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06003684 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003685 }
Alex Eldere124a822012-01-29 13:57:44 -06003686 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003687 return NULL;
3688}
3689
/*
 * Release callback for the rbd device's embedded struct device;
 * invoked by the device core once the last reference is dropped
 * (after rbd_bus_del_dev()).  Tears down the watch, the block
 * device, the header, and finally the rbd_dev itself.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop watching the header object before anything else goes away */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);


	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
3719
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003720static ssize_t rbd_remove(struct bus_type *bus,
3721 const char *buf,
3722 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003723{
3724 struct rbd_device *rbd_dev = NULL;
3725 int target_id, rc;
3726 unsigned long ul;
3727 int ret = count;
3728
3729 rc = strict_strtoul(buf, 10, &ul);
3730 if (rc)
3731 return rc;
3732
3733 /* convert to int; abort if we lost anything in the conversion */
3734 target_id = (int) ul;
3735 if (target_id != ul)
3736 return -EINVAL;
3737
3738 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3739
3740 rbd_dev = __rbd_get_dev(target_id);
3741 if (!rbd_dev) {
3742 ret = -ENOENT;
3743 goto done;
3744 }
3745
Alex Elder42382b72012-11-16 09:29:16 -06003746 if (rbd_dev->open_count) {
3747 ret = -EBUSY;
3748 goto done;
3749 }
3750
Alex Elder41f38c22012-10-25 23:34:40 -05003751 rbd_remove_all_snaps(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003752 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003753
3754done:
3755 mutex_unlock(&ctl_mutex);
Alex Elderaafb230e2012-09-06 16:00:54 -05003756
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003757 return ret;
3758}
3759
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003760/*
3761 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003762 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003763 */
3764static int rbd_sysfs_init(void)
3765{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003766 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003767
Alex Elderfed4c142012-02-07 12:03:36 -06003768 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06003769 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003770 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003771
Alex Elderfed4c142012-02-07 12:03:36 -06003772 ret = bus_register(&rbd_bus_type);
3773 if (ret < 0)
3774 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003775
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003776 return ret;
3777}
3778
static void rbd_sysfs_cleanup(void)
{
	/* Reverse order of rbd_sysfs_init(): bus first, then root device */
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3784
/*
 * Module init: create the /sys/bus/rbd control files through which
 * images are mapped (add) and unmapped (remove).
 */
int __init rbd_init(void)
{
	int rc;

	rc = rbd_sysfs_init();
	if (rc)
		return rc;
	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
	return 0;
}
3795
/* Module exit: remove the /sys/bus/rbd control files and root device. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3800
/* Standard module entry/exit hooks and metadata */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");