blob: fc1a045cee4d95e02bff324e749fe3e9a46573e6 [file] [log] [blame]
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

                 Documentation/ABI/testing/sysfs-bus-rbd

 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elderaafb230e2012-09-06 16:00:54 -050044#define RBD_DEBUG /* Activate rbd_assert() calls */
45
Alex Elder593a9e72012-02-07 12:03:37 -060046/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
Alex Elder2647ba32012-11-19 22:55:21 -060055/* It might be useful to have these defined elsewhere */
Alex Elderdf111be2012-08-09 10:33:26 -070056
Alex Elder2647ba32012-11-19 22:55:21 -060057#define U8_MAX ((u8) (~0U))
58#define U16_MAX ((u16) (~0U))
59#define U32_MAX ((u32) (~0U))
60#define U64_MAX ((u64) (~0ULL))
Alex Elderdf111be2012-08-09 10:33:26 -070061
Alex Elderf0f8cef2012-01-29 13:57:44 -060062#define RBD_DRV_NAME "rbd"
63#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070064
65#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
66
Alex Elderd4b125e2012-07-03 16:01:19 -050067#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
68#define RBD_MAX_SNAP_NAME_LEN \
69 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
70
Alex Elder35d489f2012-07-03 16:01:19 -050071#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070072
73#define RBD_SNAP_HEAD_NAME "-"
74
Alex Elder9e15b772012-10-30 19:40:33 -050075/* This allows a single page to hold an image name sent by OSD */
76#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1)
Alex Elder1e130192012-07-03 16:01:19 -050077#define RBD_IMAGE_ID_LEN_MAX 64
Alex Elder9e15b772012-10-30 19:40:33 -050078
Alex Elder1e130192012-07-03 16:01:19 -050079#define RBD_OBJ_PREFIX_LEN_MAX 64
Alex Elder589d30e2012-07-10 20:30:11 -050080
Alex Elderd8891402012-10-09 13:50:17 -070081/* Feature bits */
82
83#define RBD_FEATURE_LAYERING 1
84
85/* Features supported by this (client software) implementation. */
86
87#define RBD_FEATURES_ALL (0)
88
Alex Elder81a89792012-02-02 08:13:30 -060089/*
90 * An RBD device name will be "rbd#", where the "rbd" comes from
91 * RBD_DRV_NAME above, and # is a unique integer identifier.
92 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
93 * enough to hold all possible device names.
94 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070095#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060096#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070097
98/*
99 * block device image metadata (in-memory version)
100 */
struct rbd_image_header {
	/* These fields never change for a given rbd image */
	char *object_prefix;	/* NUL-terminated object name prefix */
	u64 features;		/* RBD_FEATURE_* bits (always 0 for v1 images) */
	__u8 obj_order;		/* log2 of the object size */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* concatenated NUL-terminated snapshot names */
	u64 *snap_sizes;	/* one image size per snapshot */

	u64 obj_version;
};
117
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500118/*
119 * An rbd image specification.
120 *
121 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
Alex Elderc66c6e02012-11-01 08:39:26 -0500122 * identify an image. Each rbd_dev structure includes a pointer to
123 * an rbd_spec structure that encapsulates this identity.
124 *
125 * Each of the id's in an rbd_spec has an associated name. For a
126 * user-mapped image, the names are supplied and the id's associated
127 * with them are looked up. For a layered image, a parent image is
128 * defined by the tuple, and the names are looked up.
129 *
130 * An rbd_dev structure contains a parent_spec pointer which is
131 * non-null if the image it represents is a child in a layered
132 * image. This pointer will refer to the rbd_spec structure used
133 * by the parent rbd_dev for its own identity (i.e., the structure
134 * is shared between the parent and child).
135 *
136 * Since these structures are populated once, during the discovery
137 * phase of image construction, they are effectively immutable so
138 * we make no effort to synchronize access to them.
139 *
140 * Note that code herein does not assume the image name is known (it
141 * could be a null pointer).
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500142 */
struct rbd_spec {
	u64 pool_id;
	char *pool_name;

	char *image_id;
	char *image_name;	/* may be NULL (see comment above) */

	u64 snap_id;		/* CEPH_NOSNAP when mapping the head */
	char *snap_name;

	struct kref kref;	/* spec may be shared by parent and child */
};
155
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700156/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600157 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700158 */
struct rbd_client {
	struct ceph_client *client;	/* the underlying ceph client */
	struct kref kref;		/* one ref per rbd device sharing it */
	struct list_head node;		/* entry on rbd_client_list */
};
164
Alex Elderbf0d5f502012-11-22 00:00:08 -0600165struct rbd_img_request;
166typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
167
168#define BAD_WHICH U32_MAX /* Good which or bad which, which? */
169
170struct rbd_obj_request;
171typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
172
Alex Elder9969ebc2013-01-18 12:31:10 -0600173enum obj_request_type {
174 OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
175};
Alex Elderbf0d5f502012-11-22 00:00:08 -0600176
struct rbd_obj_request {
	const char *object_name;	/* target rados object */
	u64 offset;		/* object start byte */
	u64 length;		/* bytes from offset */

	struct rbd_img_request *img_request;	/* owning image request, if any */
	struct list_head links;	/* img_request->obj_requests */
	u32 which;		/* posn image request list; BAD_WHICH if unlisted */

	enum obj_request_type type;	/* selects the union member below */
	union {
		struct bio *bio_list;		/* OBJ_REQUEST_BIO */
		struct {			/* OBJ_REQUEST_PAGES */
			struct page **pages;
			u32 page_count;
		};
	};

	struct ceph_osd_request *osd_req;	/* underlying OSD request */

	u64 xferred;		/* bytes transferred */
	u64 version;
	s32 result;		/* completion status */
	atomic_t done;		/* nonzero once the request has completed */

	rbd_obj_callback_t callback;	/* invoked on completion, if set */
	struct completion completion;	/* for synchronous waiters */

	struct kref kref;
};
207
struct rbd_img_request {
	struct request *rq;		/* originating block-layer request */
	struct rbd_device *rbd_dev;
	u64 offset;	/* starting image byte offset */
	u64 length;	/* byte count from offset */
	bool write_request;	/* false for read */
	union {
		struct ceph_snap_context *snapc;	/* for writes */
		u64 snap_id;		/* for reads */
	};
	spinlock_t completion_lock;/* protects next_completion */
	u32 next_completion;	/* index of next object request to complete */
	rbd_img_callback_t callback;

	u32 obj_request_count;	/* number of entries on obj_requests */
	struct list_head obj_requests;	/* rbd_obj_request structs */

	struct kref kref;
};
227
228#define for_each_obj_request(ireq, oreq) \
229 list_for_each_entry(oreq, &ireq->obj_requests, links)
230#define for_each_obj_request_from(ireq, oreq) \
231 list_for_each_entry_from(oreq, &ireq->obj_requests, links)
232#define for_each_obj_request_safe(ireq, oreq, n) \
233 list_for_each_entry_safe_reverse(oreq, n, &ireq->obj_requests, links)
234
struct rbd_snap {
	struct device dev;		/* sysfs representation */
	const char *name;
	u64 size;			/* image size at this snapshot */
	struct list_head node;		/* entry on rbd_dev->snaps */
	u64 id;
	u64 features;
};
243
/* State of the image as currently mapped (head or a snapshot) */
struct rbd_mapping {
	u64 size;		/* mapped size in bytes */
	u64 features;
	bool read_only;
};
249
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700250/*
251 * a single device
252 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;	/* blkdev's gendisk and rq */

	u32 image_format;	/* Either 1 or 2 */
	struct rbd_client *rbd_client;

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;	/* queue lock */

	struct rbd_image_header header;
	atomic_t exists;	/* nonzero while the mapped image is known to exist */
	struct rbd_spec *spec;	/* identity of the mapped image */

	char *header_name;	/* name of the image's header object */

	struct ceph_file_layout layout;

	struct ceph_osd_event *watch_event;
	struct rbd_obj_request *watch_request;

	struct rbd_spec *parent_spec;	/* non-NULL for a layered (child) image */
	u64 parent_overlap;

	/* protects updating the header */
	struct rw_semaphore header_rwsem;

	struct rbd_mapping mapping;

	struct list_head node;	/* entry on rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
	unsigned long open_count;	/* updated under ctl_mutex */
};
294
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700295static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a822012-01-29 13:57:44 -0600296
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700297static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a822012-01-29 13:57:44 -0600298static DEFINE_SPINLOCK(rbd_dev_list_lock);
299
Alex Elder432b8582012-01-29 13:57:44 -0600300static LIST_HEAD(rbd_client_list); /* clients */
301static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700302
Alex Elder304f6802012-08-31 17:29:52 -0500303static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
304static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
305
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800306static void rbd_dev_release(struct device *dev);
Alex Elder41f38c22012-10-25 23:34:40 -0500307static void rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800308
Alex Elderf0f8cef2012-01-29 13:57:44 -0600309static ssize_t rbd_add(struct bus_type *bus, const char *buf,
310 size_t count);
311static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
312 size_t count);
313
/* Writing to /sys/bus/rbd/{add,remove} maps and unmaps images */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/* Empty release: rbd_root_dev is static, so there is nothing to free */
static void rbd_root_dev_release(struct device *dev)
{
}

/* sysfs parent device for all rbd devices */
static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};
333
/*
 * Emit a warning prefixed with the most specific identification of
 * the device available: disk name, image name, image id, or (as a
 * last resort) the rbd_device pointer itself.
 */
static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;		/* %pV consumes the (fmt, args) pair */

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}
360
Alex Elderaafb230e2012-09-06 16:00:54 -0500361#ifdef RBD_DEBUG
362#define rbd_assert(expr) \
363 if (unlikely(!(expr))) { \
364 printk(KERN_ERR "\nAssertion failure in %s() " \
365 "at line %d:\n\n" \
366 "\trbd_assert(%s);\n\n", \
367 __func__, __LINE__, #expr); \
368 BUG(); \
369 }
370#else /* !RBD_DEBUG */
371# define rbd_assert(expr) ((void) 0)
372#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800373
Alex Elder117973f2012-08-31 17:29:55 -0500374static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
375static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700376
/*
 * Block device open callback.  Rejects writable opens of a read-only
 * mapping; otherwise pins the device so it cannot be torn down while
 * open and bumps the open count.
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	(void) get_device(&rbd_dev->dev);	/* dropped in rbd_release() */
	set_device_ro(bdev, rbd_dev->mapping.read_only);
	rbd_dev->open_count++;
	mutex_unlock(&ctl_mutex);

	return 0;
}
392
/*
 * Block device release callback; pairs with rbd_open().  Drops the
 * open count and the device reference taken at open time.
 */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	rbd_assert(rbd_dev->open_count > 0);
	rbd_dev->open_count--;
	put_device(&rbd_dev->dev);
	mutex_unlock(&ctl_mutex);

	return 0;
}
405
/* Block device operations; rbd supports only open and release */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
411
412/*
413 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500414 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700415 */
Alex Elderf8c38922012-08-10 13:12:07 -0700416static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700417{
418 struct rbd_client *rbdc;
419 int ret = -ENOMEM;
420
421 dout("rbd_client_create\n");
422 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
423 if (!rbdc)
424 goto out_opt;
425
426 kref_init(&rbdc->kref);
427 INIT_LIST_HEAD(&rbdc->node);
428
Alex Elderbc534d82012-01-29 13:57:44 -0600429 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
430
Alex Elder43ae4702012-07-03 16:01:18 -0500431 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700432 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600433 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500434 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700435
436 ret = ceph_open_session(rbdc->client);
437 if (ret < 0)
438 goto out_err;
439
Alex Elder432b8582012-01-29 13:57:44 -0600440 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700441 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600442 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700443
Alex Elderbc534d82012-01-29 13:57:44 -0600444 mutex_unlock(&ctl_mutex);
445
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700446 dout("rbd_client_create created %p\n", rbdc);
447 return rbdc;
448
449out_err:
450 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600451out_mutex:
452 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700453 kfree(rbdc);
454out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500455 if (ceph_opts)
456 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400457 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700458}
459
460/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700461 * Find a ceph client with specific addr and configuration. If
462 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700463 */
/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.  Returns NULL when sharing is
 * disabled (CEPH_OPT_NOSHARE) or no matching client exists.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		/* ceph_compare_options() returns 0 on a match */
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			kref_get(&client_node->kref);
			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}
484
485/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700486 * mount options
487 */
/*
 * Option tokens, grouped by argument class.  The Opt_last_* markers
 * delimit the classes for parse_rbd_opts_token().
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};

/* Per-mapping options parsed from an rbd add request */
struct rbd_options {
	bool	read_only;
};

#define RBD_READ_ONLY_DEFAULT	false
515
/*
 * match_token() callback for the option string given to "rbd add".
 * Integer and string option classes are recognized and currently only
 * logged; Boolean options update the rbd_options passed as @private.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		rbd_assert(false);	/* every valid token is handled above */
		break;
	}
	return 0;
}
556
557/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700558 * Get a ceph client with specific addr and configuration, if one does
559 * not exist create it.
560 */
/*
 * Get a ceph client with specific addr and configuration, creating a
 * new one if no shareable client exists.  Consumes *ceph_opts either
 * way: destroyed here when reusing, or handed to the new client.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *existing = rbd_client_find(ceph_opts);

	if (existing) {
		/* reusing an existing client; options no longer needed */
		ceph_destroy_options(ceph_opts);
		return existing;
	}

	return rbd_client_create(ceph_opts);
}
573
/*
 * Destroy ceph client
 *
 * Called on the final kref_put(); acquires rbd_client_list_lock
 * itself, so the caller must NOT hold it.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	/* unlink from the global client list before tearing down */
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
591
592/*
593 * Drop reference to ceph client node. If it's not referenced anymore, release
594 * it.
595 */
Alex Elder9d3997f2012-10-25 23:34:42 -0500596static void rbd_put_client(struct rbd_client *rbdc)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700597{
Alex Elderc53d5892012-10-25 23:34:42 -0500598 if (rbdc)
599 kref_put(&rbdc->kref, rbd_client_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700600}
601
Alex Eldera30b71b2012-07-10 20:30:11 -0500602static bool rbd_image_format_valid(u32 image_format)
603{
604 return image_format == 1 || image_format == 2;
605}
606
/*
 * Sanity-check an on-disk (format 1) image header before trusting its
 * contents.  Returns false for anything rbd could not have written.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
645
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700646/*
647 * Create a new header structure, translate header format from the on-disk
648 * header.
649 */
650static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500651 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700652{
Alex Elderccece232012-07-10 20:30:10 -0500653 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500654 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500655 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500656 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700657
Alex Elder6a523252012-07-19 17:12:59 -0500658 memset(header, 0, sizeof (*header));
659
Alex Elder103a1502012-08-02 11:29:45 -0500660 snap_count = le32_to_cpu(ondisk->snap_count);
661
Alex Elder58c17b02012-08-23 23:22:06 -0500662 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
663 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500664 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700665 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500666 memcpy(header->object_prefix, ondisk->object_prefix, len);
667 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600668
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700669 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500670 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
671
Alex Elder621901d2012-08-23 23:22:06 -0500672 /* Save a copy of the snapshot names */
673
Alex Elderf785cc12012-08-23 23:22:06 -0500674 if (snap_names_len > (u64) SIZE_MAX)
675 return -EIO;
676 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700677 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500678 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500679 /*
680 * Note that rbd_dev_v1_header_read() guarantees
681 * the ondisk buffer we're working with has
682 * snap_names_len bytes beyond the end of the
683 * snapshot id array, this memcpy() is safe.
684 */
685 memcpy(header->snap_names, &ondisk->snaps[snap_count],
686 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500687
Alex Elder621901d2012-08-23 23:22:06 -0500688 /* Record each snapshot's size */
689
Alex Elderd2bb24e2012-07-26 23:37:14 -0500690 size = snap_count * sizeof (*header->snap_sizes);
691 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700692 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500693 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500694 for (i = 0; i < snap_count; i++)
695 header->snap_sizes[i] =
696 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700697 } else {
Alex Elderccece232012-07-10 20:30:10 -0500698 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700699 header->snap_names = NULL;
700 header->snap_sizes = NULL;
701 }
Alex Elder849b4262012-07-09 21:04:24 -0500702
Alex Elder34b13182012-07-13 20:35:12 -0500703 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700704 header->obj_order = ondisk->options.order;
705 header->crypt_type = ondisk->options.crypt_type;
706 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500707
Alex Elder621901d2012-08-23 23:22:06 -0500708 /* Allocate and fill in the snapshot context */
709
Alex Elderf84344f2012-08-31 17:29:51 -0500710 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500711 size = sizeof (struct ceph_snap_context);
712 size += snap_count * sizeof (header->snapc->snaps[0]);
713 header->snapc = kzalloc(size, GFP_KERNEL);
714 if (!header->snapc)
715 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700716
717 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500718 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700719 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500720 for (i = 0; i < snap_count; i++)
721 header->snapc->snaps[i] =
722 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700723
724 return 0;
725
Alex Elder6a523252012-07-19 17:12:59 -0500726out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500727 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500728 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700729 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500730 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500731 kfree(header->object_prefix);
732 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500733
Alex Elder00f1f362012-02-07 12:03:36 -0600734 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700735}
736
Alex Elder9e15b772012-10-30 19:40:33 -0500737static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
738{
739 struct rbd_snap *snap;
740
741 if (snap_id == CEPH_NOSNAP)
742 return RBD_SNAP_HEAD_NAME;
743
744 list_for_each_entry(snap, &rbd_dev->snaps, node)
745 if (snap_id == snap->id)
746 return snap->name;
747
748 return NULL;
749}
750
Alex Elder8836b992012-08-30 14:42:15 -0500751static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700752{
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700753
Alex Eldere86924a2012-07-10 20:30:11 -0500754 struct rbd_snap *snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600755
Alex Eldere86924a2012-07-10 20:30:11 -0500756 list_for_each_entry(snap, &rbd_dev->snaps, node) {
757 if (!strcmp(snap_name, snap->name)) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500758 rbd_dev->spec->snap_id = snap->id;
Alex Eldere86924a2012-07-10 20:30:11 -0500759 rbd_dev->mapping.size = snap->size;
Alex Elder34b13182012-07-13 20:35:12 -0500760 rbd_dev->mapping.features = snap->features;
Alex Elder00f1f362012-02-07 12:03:36 -0600761
Alex Eldere86924a2012-07-10 20:30:11 -0500762 return 0;
Alex Elder00f1f362012-02-07 12:03:36 -0600763 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700764 }
Alex Eldere86924a2012-07-10 20:30:11 -0500765
Alex Elder00f1f362012-02-07 12:03:36 -0600766 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700767}
768
Alex Elder819d52b2012-10-25 23:34:41 -0500769static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700770{
Alex Elder78dc4472012-07-19 08:49:18 -0500771 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700772
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500773 if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
Josh Durgincc9d7342011-11-21 18:19:13 -0800774 sizeof (RBD_SNAP_HEAD_NAME))) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500775 rbd_dev->spec->snap_id = CEPH_NOSNAP;
Alex Elder99c1f082012-08-30 14:42:15 -0500776 rbd_dev->mapping.size = rbd_dev->header.image_size;
Alex Elder34b13182012-07-13 20:35:12 -0500777 rbd_dev->mapping.features = rbd_dev->header.features;
Alex Eldere86924a2012-07-10 20:30:11 -0500778 ret = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700779 } else {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500780 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700781 if (ret < 0)
782 goto done;
Alex Elderf84344f2012-08-31 17:29:51 -0500783 rbd_dev->mapping.read_only = true;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700784 }
Alex Elderd78b6502012-11-09 08:43:15 -0600785 atomic_set(&rbd_dev->exists, 1);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700786done:
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700787 return ret;
788}
789
790static void rbd_header_free(struct rbd_image_header *header)
791{
Alex Elder849b4262012-07-09 21:04:24 -0500792 kfree(header->object_prefix);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500793 header->object_prefix = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700794 kfree(header->snap_sizes);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500795 header->snap_sizes = NULL;
Alex Elder849b4262012-07-09 21:04:24 -0500796 kfree(header->snap_names);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500797 header->snap_names = NULL;
Josh Durgind1d25642011-12-05 14:03:05 -0800798 ceph_put_snap_context(header->snapc);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500799 header->snapc = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700800}
801
Alex Elder98571b52013-01-20 14:44:42 -0600802static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700803{
Alex Elder65ccfe22012-08-09 10:33:26 -0700804 char *name;
805 u64 segment;
806 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700807
Alex Elder2fd82b92012-11-09 15:05:54 -0600808 name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
Alex Elder65ccfe22012-08-09 10:33:26 -0700809 if (!name)
810 return NULL;
811 segment = offset >> rbd_dev->header.obj_order;
Alex Elder2fd82b92012-11-09 15:05:54 -0600812 ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
Alex Elder65ccfe22012-08-09 10:33:26 -0700813 rbd_dev->header.object_prefix, segment);
Alex Elder2fd82b92012-11-09 15:05:54 -0600814 if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
Alex Elder65ccfe22012-08-09 10:33:26 -0700815 pr_err("error formatting segment name for #%llu (%d)\n",
816 segment, ret);
817 kfree(name);
818 name = NULL;
819 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700820
Alex Elder65ccfe22012-08-09 10:33:26 -0700821 return name;
822}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700823
Alex Elder65ccfe22012-08-09 10:33:26 -0700824static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
825{
826 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700827
Alex Elder65ccfe22012-08-09 10:33:26 -0700828 return offset & (segment_size - 1);
829}
830
831static u64 rbd_segment_length(struct rbd_device *rbd_dev,
832 u64 offset, u64 length)
833{
834 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
835
836 offset &= segment_size - 1;
837
Alex Elderaafb230e2012-09-06 16:00:54 -0500838 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700839 if (offset + length > segment_size)
840 length = segment_size - offset;
841
842 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700843}
844
845/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700846 * returns the size of an object in the image
847 */
848static u64 rbd_obj_bytes(struct rbd_image_header *header)
849{
850 return 1 << header->obj_order;
851}
852
853/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700854 * bio helpers
855 */
856
857static void bio_chain_put(struct bio *chain)
858{
859 struct bio *tmp;
860
861 while (chain) {
862 tmp = chain;
863 chain = chain->bi_next;
864 bio_put(tmp);
865 }
866}
867
868/*
869 * zeros a bio chain, starting at specific offset
870 */
871static void zero_bio_chain(struct bio *chain, int start_ofs)
872{
873 struct bio_vec *bv;
874 unsigned long flags;
875 void *buf;
876 int i;
877 int pos = 0;
878
879 while (chain) {
880 bio_for_each_segment(bv, chain, i) {
881 if (pos + bv->bv_len > start_ofs) {
882 int remainder = max(start_ofs - pos, 0);
883 buf = bvec_kmap_irq(bv, &flags);
884 memset(buf + remainder, 0,
885 bv->bv_len - remainder);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +0200886 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700887 }
888 pos += bv->bv_len;
889 }
890
891 chain = chain->bi_next;
892 }
893}
894
/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 *
 * Returns NULL on allocation failure or if the requested range is
 * empty or does not lie within the source bio.  The whole-bio case
 * is delegated to bio_clone().
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;	/* offset of range start within first bio_vec */
	unsigned short end_idx;
	unsigned short vcnt;
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	/* Number of bio_vec entries spanned by [offset, offset + len) */
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.  At this point "resid" is the
	 * number of bytes of the range that fall in the last entry.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700975
/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;	/* tail link of the result chain */

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi) {
			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
			goto out_err;	/* EINVAL; ran out of bio's */
		}
		/* Clone no more of this bio than remains to be cloned */
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		/* Append the clone to the tail of the result chain */
		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {
			/* Source bio fully consumed; move to the next one */
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	bio_chain_put(chain);

	return NULL;
}
1038
Alex Elderbf0d5f502012-11-22 00:00:08 -06001039static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1040{
1041 kref_get(&obj_request->kref);
1042}
1043
1044static void rbd_obj_request_destroy(struct kref *kref);
1045static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1046{
1047 rbd_assert(obj_request != NULL);
1048 kref_put(&obj_request->kref, rbd_obj_request_destroy);
1049}
1050
1051static void rbd_img_request_get(struct rbd_img_request *img_request)
1052{
1053 kref_get(&img_request->kref);
1054}
1055
1056static void rbd_img_request_destroy(struct kref *kref);
1057static void rbd_img_request_put(struct rbd_img_request *img_request)
1058{
1059 rbd_assert(img_request != NULL);
1060 kref_put(&img_request->kref, rbd_img_request_destroy);
1061}
1062
/*
 * Add an object request to the tail of an image request's list.
 * The image request takes its own reference on the object request,
 * and the request's position in the list is recorded in its
 * "which" field (used later for in-order completion).
 */
static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_obj_request_get(obj_request);
	obj_request->img_request = img_request;
	list_add_tail(&obj_request->links, &img_request->obj_requests);
	obj_request->which = img_request->obj_request_count++;
	rbd_assert(obj_request->which != BAD_WHICH);
}
1072
/*
 * Remove an object request from its image request's list and drop
 * the reference the image request held on it (which may destroy it).
 * The back-pointer and callback are cleared first.
 * NOTE(review): obj_request_count is not decremented here; "which"
 * values of remaining requests are left as assigned at add time.
 */
static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->which != BAD_WHICH);
	obj_request->which = BAD_WHICH;
	list_del(&obj_request->links);
	rbd_assert(obj_request->img_request == img_request);
	obj_request->callback = NULL;
	obj_request->img_request = NULL;
	rbd_obj_request_put(obj_request);
}
1084
1085static bool obj_request_type_valid(enum obj_request_type type)
1086{
1087 switch (type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001088 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001089 case OBJ_REQUEST_BIO:
Alex Elder788e2df2013-01-17 12:25:27 -06001090 case OBJ_REQUEST_PAGES:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001091 return true;
1092 default:
1093 return false;
1094 }
1095}
1096
/*
 * Allocate and fill in an osd request op.  The variable arguments
 * expected depend on the opcode, as shown by the per-case comments
 * below.  Returns NULL on allocation failure or for an unsupported
 * opcode; the result is freed with rbd_osd_req_op_destroy().
 *
 * NOTE(review): not declared static -- confirm there are no callers
 * outside this file.
 */
struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
{
	struct ceph_osd_req_op *op;
	va_list args;
	size_t size;

	op = kzalloc(sizeof (*op), GFP_NOIO);
	if (!op)
		return NULL;
	op->op = opcode;
	va_start(args, opcode);
	switch (opcode) {
	case CEPH_OSD_OP_READ:
	case CEPH_OSD_OP_WRITE:
		/* rbd_osd_req_op_create(READ, offset, length) */
		/* rbd_osd_req_op_create(WRITE, offset, length) */
		op->extent.offset = va_arg(args, u64);
		op->extent.length = va_arg(args, u64);
		/* Only writes carry a data payload */
		if (opcode == CEPH_OSD_OP_WRITE)
			op->payload_len = op->extent.length;
		break;
	case CEPH_OSD_OP_CALL:
		/* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
		op->cls.class_name = va_arg(args, char *);
		size = strlen(op->cls.class_name);
		rbd_assert(size <= (size_t) U8_MAX);
		op->cls.class_len = size;
		op->payload_len = size;

		op->cls.method_name = va_arg(args, char *);
		size = strlen(op->cls.method_name);
		rbd_assert(size <= (size_t) U8_MAX);
		op->cls.method_len = size;
		op->payload_len += size;

		op->cls.argc = 0;
		op->cls.indata = va_arg(args, void *);
		size = va_arg(args, size_t);
		rbd_assert(size <= (size_t) U32_MAX);
		op->cls.indata_len = (u32) size;
		op->payload_len += size;
		break;
	case CEPH_OSD_OP_NOTIFY_ACK:
	case CEPH_OSD_OP_WATCH:
		/* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
		/* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
		op->watch.cookie = va_arg(args, u64);
		op->watch.ver = va_arg(args, u64);
		/* NOTE(review): ver is stored little-endian here, unlike
		 * the other fields in this function -- confirm against
		 * the osd request encoder. */
		op->watch.ver = cpu_to_le64(op->watch.ver);
		if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
			op->watch.flag = (u8) 1;
		break;
	default:
		rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
		kfree(op);
		op = NULL;
		break;
	}
	va_end(args);

	return op;
}
1159
/* Free an op allocated by rbd_osd_req_op_create(). */
static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
{
	kfree(op);
}
1164
Alex Elderbf0d5f502012-11-22 00:00:08 -06001165static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1166 struct rbd_obj_request *obj_request)
1167{
1168 return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1169}
1170
1171static void rbd_img_request_complete(struct rbd_img_request *img_request)
1172{
1173 if (img_request->callback)
1174 img_request->callback(img_request);
1175 else
1176 rbd_img_request_put(img_request);
1177}
1178
Alex Elder788e2df2013-01-17 12:25:27 -06001179/* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1180
1181static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1182{
1183 return wait_for_completion_interruptible(&obj_request->completion);
1184}
1185
Alex Elder9969ebc2013-01-18 12:31:10 -06001186static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request,
1187 struct ceph_osd_op *op)
1188{
1189 atomic_set(&obj_request->done, 1);
1190}
1191
Alex Elderbf0d5f502012-11-22 00:00:08 -06001192static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1193{
1194 if (obj_request->callback)
1195 obj_request->callback(obj_request);
Alex Elder788e2df2013-01-17 12:25:27 -06001196 else
1197 complete_all(&obj_request->completion);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001198}
1199
/*
 * Completion handler for a read op.  An ENOENT result (the backing
 * object does not exist yet) or a short read is converted into
 * zero-filled data for the affected portion of the bio chain, since
 * unwritten extents read back as zeros; in both cases the request
 * reports the full requested length as transferred.
 */
static void rbd_osd_read_callback(struct rbd_obj_request *obj_request,
				struct ceph_osd_op *op)
{
	u64 xferred;

	/*
	 * We support a 64-bit length, but ultimately it has to be
	 * passed to blk_end_request(), which takes an unsigned int.
	 */
	xferred = le64_to_cpu(op->extent.length);
	rbd_assert(xferred < (u64) UINT_MAX);
	if (obj_request->result == (s32) -ENOENT) {
		/* Nonexistent object: whole request reads as zeros */
		zero_bio_chain(obj_request->bio_list, 0);
		obj_request->result = 0;
	} else if (xferred < obj_request->length && !obj_request->result) {
		/* Short read: zero the tail beyond what was returned */
		zero_bio_chain(obj_request->bio_list, xferred);
		xferred = obj_request->length;
	}
	obj_request->xferred = xferred;
	atomic_set(&obj_request->done, 1);
}
1221
1222static void rbd_osd_write_callback(struct rbd_obj_request *obj_request,
1223 struct ceph_osd_op *op)
1224{
1225 obj_request->xferred = le64_to_cpu(op->extent.length);
1226 atomic_set(&obj_request->done, 1);
1227}
1228
/*
 * Callback invoked by the osd client when a reply message arrives
 * for one of our osd requests.  Decodes the reply header, records
 * result/version/transfer count on the object request, dispatches
 * to the per-opcode handler, and completes the object request if
 * that handler marked it done.
 */
static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
				struct ceph_msg *msg)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;
	struct ceph_osd_reply_head *reply_head;
	struct ceph_osd_op *op;
	u32 num_ops;
	u16 opcode;

	rbd_assert(osd_req == obj_request->osd_req);
	/* A request has a valid "which" iff it belongs to an image request */
	rbd_assert(!!obj_request->img_request ^
				(obj_request->which == BAD_WHICH));

	obj_request->xferred = le32_to_cpu(msg->hdr.data_len);
	reply_head = msg->front.iov_base;
	obj_request->result = (s32) le32_to_cpu(reply_head->result);
	obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);

	/* We only ever build single-op requests at this point */
	num_ops = le32_to_cpu(reply_head->num_ops);
	WARN_ON(num_ops != 1);	/* For now */

	op = &reply_head->ops[0];
	opcode = le16_to_cpu(op->op);
	switch (opcode) {
	case CEPH_OSD_OP_READ:
		rbd_osd_read_callback(obj_request, op);
		break;
	case CEPH_OSD_OP_WRITE:
		rbd_osd_write_callback(obj_request, op);
		break;
	case CEPH_OSD_OP_CALL:
	case CEPH_OSD_OP_NOTIFY_ACK:
	case CEPH_OSD_OP_WATCH:
		rbd_osd_trivial_callback(obj_request, op);
		break;
	default:
		rbd_warn(NULL, "%s: unsupported op %hu\n",
			obj_request->object_name, (unsigned short) opcode);
		break;
	}

	/* The per-op handler decides whether the request is done */
	if (atomic_read(&obj_request->done))
		rbd_obj_request_complete(obj_request);
}
1273
/*
 * Allocate and initialize an osd request for the given object
 * request, carrying the single op supplied (the op's contents are
 * copied by ceph_osdc_build_request()).  If the object request is
 * part of an image request, the snapshot context (for writes) or
 * snap id (for reads) is taken from it.  Returns NULL on
 * allocation failure.
 */
static struct ceph_osd_request *rbd_osd_req_create(
					struct rbd_device *rbd_dev,
					bool write_request,
					struct rbd_obj_request *obj_request,
					struct ceph_osd_req_op *op)
{
	struct rbd_img_request *img_request = obj_request->img_request;
	struct ceph_snap_context *snapc = NULL;
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;
	struct timespec now;
	struct timespec *mtime;
	u64 snap_id = CEPH_NOSNAP;
	u64 offset = obj_request->offset;
	u64 length = obj_request->length;

	if (img_request) {
		rbd_assert(img_request->write_request == write_request);
		if (img_request->write_request)
			snapc = img_request->snapc;
		else
			snap_id = img_request->snap_id;
	}

	/* Allocate and initialize the request, for the single op */

	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
	if (!osd_req)
		return NULL;	/* ENOMEM */

	/* Attach the request's data, according to its type */
	rbd_assert(obj_request_type_valid(obj_request->type));
	switch (obj_request->type) {
	case OBJ_REQUEST_NODATA:
		break;		/* Nothing to do */
	case OBJ_REQUEST_BIO:
		rbd_assert(obj_request->bio_list != NULL);
		osd_req->r_bio = obj_request->bio_list;
		bio_get(osd_req->r_bio);
		/* osd client requires "num pages" even for bio */
		osd_req->r_num_pages = calc_pages_for(offset, length);
		break;
	case OBJ_REQUEST_PAGES:
		osd_req->r_pages = obj_request->pages;
		osd_req->r_num_pages = obj_request->page_count;
		osd_req->r_page_alignment = offset & ~PAGE_MASK;
		break;
	}

	if (write_request) {
		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
		now = CURRENT_TIME;
		mtime = &now;
	} else {
		osd_req->r_flags = CEPH_OSD_FLAG_READ;
		mtime = NULL;	/* not needed for reads */
		offset = 0;	/* These are not used... */
		length = 0;	/* ...for osd read requests */
	}

	osd_req->r_callback = rbd_osd_req_callback;
	osd_req->r_priv = obj_request;

	osd_req->r_oid_len = strlen(obj_request->object_name);
	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);

	osd_req->r_file_layout = rbd_dev->layout;	/* struct */

	/* osd_req will get its own reference to snapc (if non-null) */

	ceph_osdc_build_request(osd_req, offset, length, 1, op,
				snapc, snap_id, mtime);

	return osd_req;
}
1350
/* Drop the reference held on an osd request (freeing it if last). */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1355
1356/* object_name is assumed to be a non-null pointer and NUL-terminated */
1357
1358static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1359 u64 offset, u64 length,
1360 enum obj_request_type type)
1361{
1362 struct rbd_obj_request *obj_request;
1363 size_t size;
1364 char *name;
1365
1366 rbd_assert(obj_request_type_valid(type));
1367
1368 size = strlen(object_name) + 1;
1369 obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1370 if (!obj_request)
1371 return NULL;
1372
1373 name = (char *)(obj_request + 1);
1374 obj_request->object_name = memcpy(name, object_name, size);
1375 obj_request->offset = offset;
1376 obj_request->length = length;
1377 obj_request->which = BAD_WHICH;
1378 obj_request->type = type;
1379 INIT_LIST_HEAD(&obj_request->links);
1380 atomic_set(&obj_request->done, 0);
Alex Elder788e2df2013-01-17 12:25:27 -06001381 init_completion(&obj_request->completion);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001382 kref_init(&obj_request->kref);
1383
1384 return obj_request;
1385}
1386
/*
 * kref release function for object requests.  The request must
 * already have been removed from any image request (which would
 * otherwise still hold a reference).  Releases the osd request and
 * any attached data (bio chain or page vector), then the request
 * itself (the object name is part of the same allocation).
 */
static void rbd_obj_request_destroy(struct kref *kref)
{
	struct rbd_obj_request *obj_request;

	obj_request = container_of(kref, struct rbd_obj_request, kref);

	rbd_assert(obj_request->img_request == NULL);
	rbd_assert(obj_request->which == BAD_WHICH);

	if (obj_request->osd_req)
		rbd_osd_req_destroy(obj_request->osd_req);

	rbd_assert(obj_request_type_valid(obj_request->type));
	switch (obj_request->type) {
	case OBJ_REQUEST_NODATA:
		break;		/* Nothing to do */
	case OBJ_REQUEST_BIO:
		if (obj_request->bio_list)
			bio_chain_put(obj_request->bio_list);
		break;
	case OBJ_REQUEST_PAGES:
		if (obj_request->pages)
			ceph_release_page_vector(obj_request->pages,
						obj_request->page_count);
		break;
	}

	kfree(obj_request);
}
1416
/*
 * Caller is responsible for filling in the list of object requests
 * that comprises the image request, and the Linux request pointer
 * (if there is one).
 *
 * For a write request, the device's current snapshot context is
 * sampled under header_rwsem and referenced for the life of the
 * image request; for a read, the mapped snapshot id is recorded
 * instead.  Returns NULL on allocation failure.
 *
 * NOTE(review): not declared static -- confirm no callers outside
 * this file.
 */
struct rbd_img_request *rbd_img_request_create(struct rbd_device *rbd_dev,
					u64 offset, u64 length,
					bool write_request)
{
	struct rbd_img_request *img_request;
	struct ceph_snap_context *snapc = NULL;

	img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
	if (!img_request)
		return NULL;

	if (write_request) {
		down_read(&rbd_dev->header_rwsem);
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
		up_read(&rbd_dev->header_rwsem);
		if (WARN_ON(!snapc)) {
			kfree(img_request);
			return NULL;	/* Shouldn't happen */
		}
	}

	/* Every field is set explicitly; the struct is not zeroed */
	img_request->rq = NULL;
	img_request->rbd_dev = rbd_dev;
	img_request->offset = offset;
	img_request->length = length;
	img_request->write_request = write_request;
	if (write_request)
		img_request->snapc = snapc;
	else
		img_request->snap_id = rbd_dev->spec->snap_id;
	spin_lock_init(&img_request->completion_lock);
	img_request->next_completion = 0;
	img_request->callback = NULL;
	img_request->obj_request_count = 0;
	INIT_LIST_HEAD(&img_request->obj_requests);
	kref_init(&img_request->kref);

	rbd_img_request_get(img_request);	/* Avoid a warning */
	rbd_img_request_put(img_request);	/* TEMPORARY */

	return img_request;
}
1464
1465static void rbd_img_request_destroy(struct kref *kref)
1466{
1467 struct rbd_img_request *img_request;
1468 struct rbd_obj_request *obj_request;
1469 struct rbd_obj_request *next_obj_request;
1470
1471 img_request = container_of(kref, struct rbd_img_request, kref);
1472
1473 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1474 rbd_img_obj_request_del(img_request, obj_request);
1475
1476 if (img_request->write_request)
1477 ceph_put_snap_context(img_request->snapc);
1478
1479 kfree(img_request);
1480}
1481
/*
 * Break an image request up into one object request per image
 * segment touched by its [offset, offset + length) byte range,
 * cloning the matching portion of the supplied bio chain into each.
 * On any failure, all object requests created so far are released
 * and -ENOMEM is returned.
 */
static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
					struct bio *bio_list)
{
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	struct rbd_obj_request *obj_request = NULL;
	struct rbd_obj_request *next_obj_request;
	unsigned int bio_offset;
	u64 image_offset;
	u64 resid;	/* bytes of the range not yet covered */
	u16 opcode;

	opcode = img_request->write_request ? CEPH_OSD_OP_WRITE
					: CEPH_OSD_OP_READ;
	bio_offset = 0;
	image_offset = img_request->offset;
	rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT);
	resid = img_request->length;
	while (resid) {
		const char *object_name;
		unsigned int clone_size;
		struct ceph_osd_req_op *op;
		u64 offset;
		u64 length;

		object_name = rbd_segment_name(rbd_dev, image_offset);
		if (!object_name)
			goto out_unwind;
		/* Clip the request to the current segment's boundary */
		offset = rbd_segment_offset(rbd_dev, image_offset);
		length = rbd_segment_length(rbd_dev, image_offset, resid);
		obj_request = rbd_obj_request_create(object_name,
						offset, length,
						OBJ_REQUEST_BIO);
		kfree(object_name);	/* object request has its own copy */
		if (!obj_request)
			goto out_unwind;

		rbd_assert(length <= (u64) UINT_MAX);
		clone_size = (unsigned int) length;
		obj_request->bio_list = bio_chain_clone_range(&bio_list,
						&bio_offset, clone_size,
						GFP_ATOMIC);
		if (!obj_request->bio_list)
			goto out_partial;

		/*
		 * Build up the op to use in building the osd
		 * request.  Note that the contents of the op are
		 * copied by rbd_osd_req_create().
		 */
		op = rbd_osd_req_op_create(opcode, offset, length);
		if (!op)
			goto out_partial;
		obj_request->osd_req = rbd_osd_req_create(rbd_dev,
						img_request->write_request,
						obj_request, op);
		rbd_osd_req_op_destroy(op);
		if (!obj_request->osd_req)
			goto out_partial;
		/* status and version are initially zero-filled */

		rbd_img_obj_request_add(img_request, obj_request);

		image_offset += length;
		resid -= length;
	}

	return 0;

out_partial:
	/* The failed request was never added to the image request */
	rbd_obj_request_put(obj_request);
out_unwind:
	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_obj_request_put(obj_request);

	return -ENOMEM;
}
1558
/*
 * Completion callback for object requests that are part of an image
 * request.  Object requests may complete out of order, but
 * blk_end_request() must be called in order: under completion_lock,
 * next_completion is advanced past every consecutive completed
 * request starting at the current position, ending the matching
 * portion of the block-layer request for each.  When the last
 * object request has been accounted for, the image request itself
 * is completed.
 */
static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	u32 which = obj_request->which;
	bool more = true;

	img_request = obj_request->img_request;
	rbd_assert(img_request != NULL);
	rbd_assert(img_request->rq != NULL);
	rbd_assert(which != BAD_WHICH);
	rbd_assert(which < img_request->obj_request_count);
	rbd_assert(which >= img_request->next_completion);

	spin_lock_irq(&img_request->completion_lock);
	/* Only the request at next_completion may report progress */
	if (which != img_request->next_completion)
		goto out;

	for_each_obj_request_from(img_request, obj_request) {
		unsigned int xferred;
		int result;

		rbd_assert(more);
		rbd_assert(which < img_request->obj_request_count);

		/* Stop at the first request that isn't done yet */
		if (!atomic_read(&obj_request->done))
			break;

		rbd_assert(obj_request->xferred <= (u64) UINT_MAX);
		xferred = (unsigned int) obj_request->xferred;
		result = (int) obj_request->result;
		if (result)
			rbd_warn(NULL, "obj_request %s result %d xferred %u\n",
				img_request->write_request ? "write" : "read",
				result, xferred);

		/* more becomes false once the whole rq has been ended */
		more = blk_end_request(img_request->rq, result, xferred);
		which++;
	}
	rbd_assert(more ^ (which == img_request->obj_request_count));
	img_request->next_completion = which;
out:
	spin_unlock_irq(&img_request->completion_lock);

	if (!more)
		rbd_img_request_complete(img_request);
}
1605
/*
 * Submit each of an image request's object requests to the osd
 * client.  Returns 0 on success; on the first submission failure
 * its error is returned (requests already submitted are not
 * unwound here).
 */
static int rbd_img_request_submit(struct rbd_img_request *img_request)
{
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;

	for_each_obj_request(img_request, obj_request) {
		int ret;

		obj_request->callback = rbd_img_obj_callback;
		ret = rbd_obj_request_submit(osdc, obj_request);
		if (ret)
			return ret;
		/*
		 * The image request has its own reference to each
		 * of its object requests, so we can safely drop the
		 * initial one here.
		 */
		rbd_obj_request_put(obj_request);
	}

	return 0;
}
1629
/*
 * Acknowledge a watch notification on the header object by sending
 * a notify-ack op.  The object request completes asynchronously;
 * its callback (rbd_obj_request_put) releases it when the ack has
 * been handled.  Returns 0 on successful submission or a negative
 * errno (in which case the request is released here).
 */
static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
				u64 ver, u64 notify_id)
{
	struct rbd_obj_request *obj_request;
	struct ceph_osd_req_op *op;
	struct ceph_osd_client *osdc;
	int ret;

	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
							OBJ_REQUEST_NODATA);
	if (!obj_request)
		return -ENOMEM;

	ret = -ENOMEM;
	op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
	if (!op)
		goto out;
	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
						obj_request, op);
	/* The op was copied into the osd request; free our copy */
	rbd_osd_req_op_destroy(op);
	if (!obj_request->osd_req)
		goto out;

	osdc = &rbd_dev->rbd_client->client->osdc;
	/* The request puts itself once the ack completes */
	obj_request->callback = rbd_obj_request_put;
	ret = rbd_obj_request_submit(osdc, obj_request);
out:
	if (ret)
		rbd_obj_request_put(obj_request);

	return ret;
}
1662
1663static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1664{
1665 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1666 u64 hver;
1667 int rc;
1668
1669 if (!rbd_dev)
1670 return;
1671
1672 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1673 rbd_dev->header_name, (unsigned long long) notify_id,
1674 (unsigned int) opcode);
1675 rc = rbd_dev_refresh(rbd_dev, &hver);
1676 if (rc)
1677 rbd_warn(rbd_dev, "got notification but failed to "
1678 " update snaps: %d\n", rc);
1679
Alex Eldercf81b602013-01-17 12:18:46 -06001680 rbd_obj_notify_ack(rbd_dev, hver, notify_id);
Alex Elderb8d70032012-11-30 17:53:04 -06001681}
1682
Alex Elder9969ebc2013-01-18 12:31:10 -06001683/*
1684 * Request sync osd watch/unwatch. The value of "start" determines
1685 * whether a watch request is being initiated or torn down.
1686 */
1687static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
1688{
1689 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1690 struct rbd_obj_request *obj_request;
1691 struct ceph_osd_req_op *op;
1692 int ret;
1693
1694 rbd_assert(start ^ !!rbd_dev->watch_event);
1695 rbd_assert(start ^ !!rbd_dev->watch_request);
1696
1697 if (start) {
1698 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev,
1699 &rbd_dev->watch_event);
1700 if (ret < 0)
1701 return ret;
1702 }
1703
1704 ret = -ENOMEM;
1705 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
1706 OBJ_REQUEST_NODATA);
1707 if (!obj_request)
1708 goto out_cancel;
1709
1710 op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
1711 rbd_dev->watch_event->cookie,
1712 rbd_dev->header.obj_version, start);
1713 if (!op)
1714 goto out_cancel;
1715 obj_request->osd_req = rbd_osd_req_create(rbd_dev, true,
1716 obj_request, op);
1717 rbd_osd_req_op_destroy(op);
1718 if (!obj_request->osd_req)
1719 goto out_cancel;
1720
1721 if (start) {
Alex Elder975241a2013-01-25 17:08:55 -06001722 ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
1723 rbd_dev->watch_request = obj_request;
Alex Elder6977c3f2013-01-25 17:08:55 -06001724 } else {
1725 ceph_osdc_unregister_linger_request(osdc,
Alex Elder975241a2013-01-25 17:08:55 -06001726 rbd_dev->watch_request->osd_req);
Alex Elder6977c3f2013-01-25 17:08:55 -06001727 rbd_dev->watch_request = NULL;
Alex Elder9969ebc2013-01-18 12:31:10 -06001728 }
1729 ret = rbd_obj_request_submit(osdc, obj_request);
1730 if (ret)
1731 goto out_cancel;
1732 ret = rbd_obj_request_wait(obj_request);
1733 if (ret)
1734 goto out_cancel;
1735
1736 ret = obj_request->result;
1737 if (ret)
1738 goto out_cancel;
1739
1740 if (start)
1741 goto done; /* Done if setting up the watch request */
1742out_cancel:
1743 /* Cancel the event if we're tearing down, or on error */
1744 ceph_osdc_cancel_event(rbd_dev->watch_event);
1745 rbd_dev->watch_event = NULL;
1746done:
1747 if (obj_request)
1748 rbd_obj_request_put(obj_request);
1749
1750 return ret;
1751}
1752
Alex Elder36be9a72013-01-19 00:30:28 -06001753/*
1754 * Synchronous osd object method call
1755 */
1756static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
1757 const char *object_name,
1758 const char *class_name,
1759 const char *method_name,
1760 const char *outbound,
1761 size_t outbound_size,
1762 char *inbound,
1763 size_t inbound_size,
1764 u64 *version)
1765{
1766 struct rbd_obj_request *obj_request;
1767 struct ceph_osd_client *osdc;
1768 struct ceph_osd_req_op *op;
1769 struct page **pages;
1770 u32 page_count;
1771 int ret;
1772
1773 /*
1774 * Method calls are ultimately read operations but they
1775 * don't involve object data (so no offset or length).
1776 * The result should placed into the inbound buffer
1777 * provided. They also supply outbound data--parameters for
1778 * the object method. Currently if this is present it will
1779 * be a snapshot id.
1780 */
1781 page_count = (u32) calc_pages_for(0, inbound_size);
1782 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
1783 if (IS_ERR(pages))
1784 return PTR_ERR(pages);
1785
1786 ret = -ENOMEM;
1787 obj_request = rbd_obj_request_create(object_name, 0, 0,
1788 OBJ_REQUEST_PAGES);
1789 if (!obj_request)
1790 goto out;
1791
1792 obj_request->pages = pages;
1793 obj_request->page_count = page_count;
1794
1795 op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
1796 method_name, outbound, outbound_size);
1797 if (!op)
1798 goto out;
1799 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
1800 obj_request, op);
1801 rbd_osd_req_op_destroy(op);
1802 if (!obj_request->osd_req)
1803 goto out;
1804
1805 osdc = &rbd_dev->rbd_client->client->osdc;
1806 ret = rbd_obj_request_submit(osdc, obj_request);
1807 if (ret)
1808 goto out;
1809 ret = rbd_obj_request_wait(obj_request);
1810 if (ret)
1811 goto out;
1812
1813 ret = obj_request->result;
1814 if (ret < 0)
1815 goto out;
1816 ret = ceph_copy_from_page_vector(pages, inbound, 0,
1817 obj_request->xferred);
1818 if (version)
1819 *version = obj_request->version;
1820out:
1821 if (obj_request)
1822 rbd_obj_request_put(obj_request);
1823 else
1824 ceph_release_page_vector(pages, page_count);
1825
1826 return ret;
1827}
1828
Alex Elderbf0d5f502012-11-22 00:00:08 -06001829static void rbd_request_fn(struct request_queue *q)
1830{
1831 struct rbd_device *rbd_dev = q->queuedata;
1832 bool read_only = rbd_dev->mapping.read_only;
1833 struct request *rq;
1834 int result;
1835
1836 while ((rq = blk_fetch_request(q))) {
1837 bool write_request = rq_data_dir(rq) == WRITE;
1838 struct rbd_img_request *img_request;
1839 u64 offset;
1840 u64 length;
1841
1842 /* Ignore any non-FS requests that filter through. */
1843
1844 if (rq->cmd_type != REQ_TYPE_FS) {
1845 __blk_end_request_all(rq, 0);
1846 continue;
1847 }
1848
1849 spin_unlock_irq(q->queue_lock);
1850
1851 /* Disallow writes to a read-only device */
1852
1853 if (write_request) {
1854 result = -EROFS;
1855 if (read_only)
1856 goto end_request;
1857 rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
1858 }
1859
1860 /* Quit early if the snapshot has disappeared */
1861
1862 if (!atomic_read(&rbd_dev->exists)) {
1863 dout("request for non-existent snapshot");
1864 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
1865 result = -ENXIO;
1866 goto end_request;
1867 }
1868
1869 offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
1870 length = (u64) blk_rq_bytes(rq);
1871
1872 result = -EINVAL;
1873 if (WARN_ON(offset && length > U64_MAX - offset + 1))
1874 goto end_request; /* Shouldn't happen */
1875
1876 result = -ENOMEM;
1877 img_request = rbd_img_request_create(rbd_dev, offset, length,
1878 write_request);
1879 if (!img_request)
1880 goto end_request;
1881
1882 img_request->rq = rq;
1883
1884 result = rbd_img_request_fill_bio(img_request, rq->bio);
1885 if (!result)
1886 result = rbd_img_request_submit(img_request);
1887 if (result)
1888 rbd_img_request_put(img_request);
1889end_request:
1890 spin_lock_irq(q->queue_lock);
1891 if (result < 0) {
1892 rbd_warn(rbd_dev, "obj_request %s result %d\n",
1893 write_request ? "write" : "read", result);
1894 __blk_end_request_all(rq, result);
1895 }
1896 }
1897}
1898
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001899/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001900 * a queue callback. Makes sure that we don't create a bio that spans across
1901 * multiple osd objects. One exception would be with a single page bios,
Alex Elderf7760da2012-10-20 22:17:27 -05001902 * which we handle later at bio_chain_clone_range()
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001903 */
1904static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1905 struct bio_vec *bvec)
1906{
1907 struct rbd_device *rbd_dev = q->queuedata;
Alex Eldere5cfeed22012-10-20 22:17:27 -05001908 sector_t sector_offset;
1909 sector_t sectors_per_obj;
1910 sector_t obj_sector_offset;
1911 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001912
Alex Eldere5cfeed22012-10-20 22:17:27 -05001913 /*
1914 * Find how far into its rbd object the partition-relative
1915 * bio start sector is to offset relative to the enclosing
1916 * device.
1917 */
1918 sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
1919 sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1920 obj_sector_offset = sector_offset & (sectors_per_obj - 1);
Alex Elder593a9e72012-02-07 12:03:37 -06001921
Alex Eldere5cfeed22012-10-20 22:17:27 -05001922 /*
1923 * Compute the number of bytes from that offset to the end
1924 * of the object. Account for what's already used by the bio.
1925 */
1926 ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
1927 if (ret > bmd->bi_size)
1928 ret -= bmd->bi_size;
1929 else
1930 ret = 0;
1931
1932 /*
1933 * Don't send back more than was asked for. And if the bio
1934 * was empty, let the whole thing through because: "Note
1935 * that a block device *must* allow a single page to be
1936 * added to an empty bio."
1937 */
1938 rbd_assert(bvec->bv_len <= PAGE_SIZE);
1939 if (ret > (int) bvec->bv_len || !bmd->bi_size)
1940 ret = (int) bvec->bv_len;
1941
1942 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001943}
1944
1945static void rbd_free_disk(struct rbd_device *rbd_dev)
1946{
1947 struct gendisk *disk = rbd_dev->disk;
1948
1949 if (!disk)
1950 return;
1951
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001952 if (disk->flags & GENHD_FL_UP)
1953 del_gendisk(disk);
1954 if (disk->queue)
1955 blk_cleanup_queue(disk->queue);
1956 put_disk(disk);
1957}
1958
Alex Elder788e2df2013-01-17 12:25:27 -06001959static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
1960 const char *object_name,
1961 u64 offset, u64 length,
1962 char *buf, u64 *version)
1963
1964{
1965 struct ceph_osd_req_op *op;
1966 struct rbd_obj_request *obj_request;
1967 struct ceph_osd_client *osdc;
1968 struct page **pages = NULL;
1969 u32 page_count;
1970 int ret;
1971
1972 page_count = (u32) calc_pages_for(offset, length);
1973 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
1974 if (IS_ERR(pages))
1975 ret = PTR_ERR(pages);
1976
1977 ret = -ENOMEM;
1978 obj_request = rbd_obj_request_create(object_name, offset, length,
Alex Elder36be9a72013-01-19 00:30:28 -06001979 OBJ_REQUEST_PAGES);
Alex Elder788e2df2013-01-17 12:25:27 -06001980 if (!obj_request)
1981 goto out;
1982
1983 obj_request->pages = pages;
1984 obj_request->page_count = page_count;
1985
1986 op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length);
1987 if (!op)
1988 goto out;
1989 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
1990 obj_request, op);
1991 rbd_osd_req_op_destroy(op);
1992 if (!obj_request->osd_req)
1993 goto out;
1994
1995 osdc = &rbd_dev->rbd_client->client->osdc;
1996 ret = rbd_obj_request_submit(osdc, obj_request);
1997 if (ret)
1998 goto out;
1999 ret = rbd_obj_request_wait(obj_request);
2000 if (ret)
2001 goto out;
2002
2003 ret = obj_request->result;
2004 if (ret < 0)
2005 goto out;
2006 ret = ceph_copy_from_page_vector(pages, buf, 0, obj_request->xferred);
2007 if (version)
2008 *version = obj_request->version;
2009out:
2010 if (obj_request)
2011 rbd_obj_request_put(obj_request);
2012 else
2013 ceph_release_page_vector(pages, page_count);
2014
2015 return ret;
2016}
2017
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002018/*
Alex Elder4156d992012-08-02 11:29:46 -05002019 * Read the complete header for the given rbd device.
2020 *
2021 * Returns a pointer to a dynamically-allocated buffer containing
2022 * the complete and validated header. Caller can pass the address
2023 * of a variable that will be filled in with the version of the
2024 * header object at the time it was read.
2025 *
2026 * Returns a pointer-coded errno if a failure occurs.
2027 */
2028static struct rbd_image_header_ondisk *
2029rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
2030{
2031 struct rbd_image_header_ondisk *ondisk = NULL;
2032 u32 snap_count = 0;
2033 u64 names_size = 0;
2034 u32 want_count;
2035 int ret;
2036
2037 /*
2038 * The complete header will include an array of its 64-bit
2039 * snapshot ids, followed by the names of those snapshots as
2040 * a contiguous block of NUL-terminated strings. Note that
2041 * the number of snapshots could change by the time we read
2042 * it in, in which case we re-read it.
2043 */
2044 do {
2045 size_t size;
2046
2047 kfree(ondisk);
2048
2049 size = sizeof (*ondisk);
2050 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
2051 size += names_size;
2052 ondisk = kmalloc(size, GFP_KERNEL);
2053 if (!ondisk)
2054 return ERR_PTR(-ENOMEM);
2055
Alex Elder788e2df2013-01-17 12:25:27 -06002056 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
Alex Elder4156d992012-08-02 11:29:46 -05002057 0, size,
2058 (char *) ondisk, version);
2059
2060 if (ret < 0)
2061 goto out_err;
2062 if (WARN_ON((size_t) ret < size)) {
2063 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05002064 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
2065 size, ret);
Alex Elder4156d992012-08-02 11:29:46 -05002066 goto out_err;
2067 }
2068 if (!rbd_dev_ondisk_valid(ondisk)) {
2069 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05002070 rbd_warn(rbd_dev, "invalid header");
Alex Elder4156d992012-08-02 11:29:46 -05002071 goto out_err;
2072 }
2073
2074 names_size = le64_to_cpu(ondisk->snap_names_len);
2075 want_count = snap_count;
2076 snap_count = le32_to_cpu(ondisk->snap_count);
2077 } while (snap_count != want_count);
2078
2079 return ondisk;
2080
2081out_err:
2082 kfree(ondisk);
2083
2084 return ERR_PTR(ret);
2085}
2086
2087/*
2088 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002089 */
2090static int rbd_read_header(struct rbd_device *rbd_dev,
2091 struct rbd_image_header *header)
2092{
Alex Elder4156d992012-08-02 11:29:46 -05002093 struct rbd_image_header_ondisk *ondisk;
2094 u64 ver = 0;
2095 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002096
Alex Elder4156d992012-08-02 11:29:46 -05002097 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
2098 if (IS_ERR(ondisk))
2099 return PTR_ERR(ondisk);
2100 ret = rbd_header_from_disk(header, ondisk);
2101 if (ret >= 0)
2102 header->obj_version = ver;
2103 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002104
Alex Elder4156d992012-08-02 11:29:46 -05002105 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002106}
2107
Alex Elder41f38c22012-10-25 23:34:40 -05002108static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002109{
2110 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05002111 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002112
Alex Eldera0593292012-07-19 09:09:27 -05002113 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder41f38c22012-10-25 23:34:40 -05002114 rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002115}
2116
Alex Elder94785542012-10-09 13:50:17 -07002117static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
2118{
2119 sector_t size;
2120
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002121 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Alex Elder94785542012-10-09 13:50:17 -07002122 return;
2123
2124 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
2125 dout("setting size to %llu sectors", (unsigned long long) size);
2126 rbd_dev->mapping.size = (u64) size;
2127 set_capacity(rbd_dev->disk, size);
2128}
2129
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002130/*
2131 * only read the first part of the ondisk header, without the snaps info
2132 */
Alex Elder117973f2012-08-31 17:29:55 -05002133static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002134{
2135 int ret;
2136 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002137
2138 ret = rbd_read_header(rbd_dev, &h);
2139 if (ret < 0)
2140 return ret;
2141
Josh Durgina51aa0c2011-12-05 10:35:04 -08002142 down_write(&rbd_dev->header_rwsem);
2143
Alex Elder94785542012-10-09 13:50:17 -07002144 /* Update image size, and check for resize of mapped image */
2145 rbd_dev->header.image_size = h.image_size;
2146 rbd_update_mapping_size(rbd_dev);
Sage Weil9db4b3e2011-04-19 22:49:06 -07002147
Alex Elder849b4262012-07-09 21:04:24 -05002148 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002149 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05002150 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08002151 /* osd requests may still refer to snapc */
2152 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002153
Alex Elderb8136232012-07-25 09:32:41 -05002154 if (hver)
2155 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08002156 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08002157 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002158 rbd_dev->header.snapc = h.snapc;
2159 rbd_dev->header.snap_names = h.snap_names;
2160 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05002161 /* Free the extra copy of the object prefix */
2162 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
2163 kfree(h.object_prefix);
2164
Alex Elder304f6802012-08-31 17:29:52 -05002165 ret = rbd_dev_snaps_update(rbd_dev);
2166 if (!ret)
2167 ret = rbd_dev_snaps_register(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002168
Josh Durginc6666012011-11-21 17:11:12 -08002169 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002170
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002171 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002172}
2173
Alex Elder117973f2012-08-31 17:29:55 -05002174static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
Alex Elder1fe5e992012-07-25 09:32:41 -05002175{
2176 int ret;
2177
Alex Elder117973f2012-08-31 17:29:55 -05002178 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder1fe5e992012-07-25 09:32:41 -05002179 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05002180 if (rbd_dev->image_format == 1)
2181 ret = rbd_dev_v1_refresh(rbd_dev, hver);
2182 else
2183 ret = rbd_dev_v2_refresh(rbd_dev, hver);
Alex Elder1fe5e992012-07-25 09:32:41 -05002184 mutex_unlock(&ctl_mutex);
2185
2186 return ret;
2187}
2188
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002189static int rbd_init_disk(struct rbd_device *rbd_dev)
2190{
2191 struct gendisk *disk;
2192 struct request_queue *q;
Alex Elder593a9e72012-02-07 12:03:37 -06002193 u64 segment_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002194
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002195 /* create gendisk info */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002196 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
2197 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05002198 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002199
Alex Elderf0f8cef2012-01-29 13:57:44 -06002200 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05002201 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002202 disk->major = rbd_dev->major;
2203 disk->first_minor = 0;
2204 disk->fops = &rbd_bd_ops;
2205 disk->private_data = rbd_dev;
2206
Alex Elderbf0d5f502012-11-22 00:00:08 -06002207 q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002208 if (!q)
2209 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07002210
Alex Elder593a9e72012-02-07 12:03:37 -06002211 /* We use the default size, but let's be explicit about it. */
2212 blk_queue_physical_block_size(q, SECTOR_SIZE);
2213
Josh Durgin029bcbd2011-07-22 11:35:23 -07002214 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06002215 segment_size = rbd_obj_bytes(&rbd_dev->header);
2216 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
2217 blk_queue_max_segment_size(q, segment_size);
2218 blk_queue_io_min(q, segment_size);
2219 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07002220
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002221 blk_queue_merge_bvec(q, rbd_merge_bvec);
2222 disk->queue = q;
2223
2224 q->queuedata = rbd_dev;
2225
2226 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002227
Alex Elder12f02942012-08-29 17:11:07 -05002228 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
2229
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002230 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002231out_disk:
2232 put_disk(disk);
Alex Elder1fcdb8a2012-08-29 17:11:06 -05002233
2234 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002235}
2236
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002237/*
2238 sysfs
2239*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002240
Alex Elder593a9e72012-02-07 12:03:37 -06002241static struct rbd_device *dev_to_rbd_dev(struct device *dev)
2242{
2243 return container_of(dev, struct rbd_device, dev);
2244}
2245
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002246static ssize_t rbd_size_show(struct device *dev,
2247 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002248{
Alex Elder593a9e72012-02-07 12:03:37 -06002249 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08002250 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002251
Josh Durgina51aa0c2011-12-05 10:35:04 -08002252 down_read(&rbd_dev->header_rwsem);
2253 size = get_capacity(rbd_dev->disk);
2254 up_read(&rbd_dev->header_rwsem);
2255
2256 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002257}
2258
Alex Elder34b13182012-07-13 20:35:12 -05002259/*
2260 * Note this shows the features for whatever's mapped, which is not
2261 * necessarily the base image.
2262 */
2263static ssize_t rbd_features_show(struct device *dev,
2264 struct device_attribute *attr, char *buf)
2265{
2266 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2267
2268 return sprintf(buf, "0x%016llx\n",
2269 (unsigned long long) rbd_dev->mapping.features);
2270}
2271
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002272static ssize_t rbd_major_show(struct device *dev,
2273 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002274{
Alex Elder593a9e72012-02-07 12:03:37 -06002275 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002276
2277 return sprintf(buf, "%d\n", rbd_dev->major);
2278}
2279
2280static ssize_t rbd_client_id_show(struct device *dev,
2281 struct device_attribute *attr, char *buf)
2282{
Alex Elder593a9e72012-02-07 12:03:37 -06002283 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002284
Alex Elder1dbb4392012-01-24 10:08:37 -06002285 return sprintf(buf, "client%lld\n",
2286 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002287}
2288
2289static ssize_t rbd_pool_show(struct device *dev,
2290 struct device_attribute *attr, char *buf)
2291{
Alex Elder593a9e72012-02-07 12:03:37 -06002292 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002293
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002294 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002295}
2296
Alex Elder9bb2f332012-07-12 10:46:35 -05002297static ssize_t rbd_pool_id_show(struct device *dev,
2298 struct device_attribute *attr, char *buf)
2299{
2300 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2301
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002302 return sprintf(buf, "%llu\n",
2303 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05002304}
2305
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002306static ssize_t rbd_name_show(struct device *dev,
2307 struct device_attribute *attr, char *buf)
2308{
Alex Elder593a9e72012-02-07 12:03:37 -06002309 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002310
Alex Eldera92ffdf2012-10-30 19:40:33 -05002311 if (rbd_dev->spec->image_name)
2312 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2313
2314 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002315}
2316
Alex Elder589d30e2012-07-10 20:30:11 -05002317static ssize_t rbd_image_id_show(struct device *dev,
2318 struct device_attribute *attr, char *buf)
2319{
2320 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2321
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002322 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05002323}
2324
Alex Elder34b13182012-07-13 20:35:12 -05002325/*
2326 * Shows the name of the currently-mapped snapshot (or
2327 * RBD_SNAP_HEAD_NAME for the base image).
2328 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002329static ssize_t rbd_snap_show(struct device *dev,
2330 struct device_attribute *attr,
2331 char *buf)
2332{
Alex Elder593a9e72012-02-07 12:03:37 -06002333 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002334
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002335 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002336}
2337
Alex Elder86b00e02012-10-25 23:34:42 -05002338/*
2339 * For an rbd v2 image, shows the pool id, image id, and snapshot id
2340 * for the parent image. If there is no parent, simply shows
2341 * "(no parent image)".
2342 */
2343static ssize_t rbd_parent_show(struct device *dev,
2344 struct device_attribute *attr,
2345 char *buf)
2346{
2347 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2348 struct rbd_spec *spec = rbd_dev->parent_spec;
2349 int count;
2350 char *bufp = buf;
2351
2352 if (!spec)
2353 return sprintf(buf, "(no parent image)\n");
2354
2355 count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
2356 (unsigned long long) spec->pool_id, spec->pool_name);
2357 if (count < 0)
2358 return count;
2359 bufp += count;
2360
2361 count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
2362 spec->image_name ? spec->image_name : "(unknown)");
2363 if (count < 0)
2364 return count;
2365 bufp += count;
2366
2367 count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
2368 (unsigned long long) spec->snap_id, spec->snap_name);
2369 if (count < 0)
2370 return count;
2371 bufp += count;
2372
2373 count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
2374 if (count < 0)
2375 return count;
2376 bufp += count;
2377
2378 return (ssize_t) (bufp - buf);
2379}
2380
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002381static ssize_t rbd_image_refresh(struct device *dev,
2382 struct device_attribute *attr,
2383 const char *buf,
2384 size_t size)
2385{
Alex Elder593a9e72012-02-07 12:03:37 -06002386 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05002387 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002388
Alex Elder117973f2012-08-31 17:29:55 -05002389 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05002390
2391 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002392}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002393
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002394static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
Alex Elder34b13182012-07-13 20:35:12 -05002395static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002396static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
2397static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
2398static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
Alex Elder9bb2f332012-07-12 10:46:35 -05002399static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002400static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
Alex Elder589d30e2012-07-10 20:30:11 -05002401static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002402static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2403static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
Alex Elder86b00e02012-10-25 23:34:42 -05002404static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002405
2406static struct attribute *rbd_attrs[] = {
2407 &dev_attr_size.attr,
Alex Elder34b13182012-07-13 20:35:12 -05002408 &dev_attr_features.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002409 &dev_attr_major.attr,
2410 &dev_attr_client_id.attr,
2411 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05002412 &dev_attr_pool_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002413 &dev_attr_name.attr,
Alex Elder589d30e2012-07-10 20:30:11 -05002414 &dev_attr_image_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002415 &dev_attr_current_snap.attr,
Alex Elder86b00e02012-10-25 23:34:42 -05002416 &dev_attr_parent.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002417 &dev_attr_refresh.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002418 NULL
2419};
2420
2421static struct attribute_group rbd_attr_group = {
2422 .attrs = rbd_attrs,
2423};
2424
2425static const struct attribute_group *rbd_attr_groups[] = {
2426 &rbd_attr_group,
2427 NULL
2428};
2429
2430static void rbd_sysfs_dev_release(struct device *dev)
2431{
2432}
2433
2434static struct device_type rbd_device_type = {
2435 .name = "rbd",
2436 .groups = rbd_attr_groups,
2437 .release = rbd_sysfs_dev_release,
2438};
2439
2440
2441/*
2442 sysfs - snapshots
2443*/
2444
2445static ssize_t rbd_snap_size_show(struct device *dev,
2446 struct device_attribute *attr,
2447 char *buf)
2448{
2449 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2450
Josh Durgin35915382011-12-05 18:25:13 -08002451 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002452}
2453
2454static ssize_t rbd_snap_id_show(struct device *dev,
2455 struct device_attribute *attr,
2456 char *buf)
2457{
2458 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2459
Josh Durgin35915382011-12-05 18:25:13 -08002460 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002461}
2462
Alex Elder34b13182012-07-13 20:35:12 -05002463static ssize_t rbd_snap_features_show(struct device *dev,
2464 struct device_attribute *attr,
2465 char *buf)
2466{
2467 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2468
2469 return sprintf(buf, "0x%016llx\n",
2470 (unsigned long long) snap->features);
2471}
2472
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002473static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2474static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
Alex Elder34b13182012-07-13 20:35:12 -05002475static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002476
2477static struct attribute *rbd_snap_attrs[] = {
2478 &dev_attr_snap_size.attr,
2479 &dev_attr_snap_id.attr,
Alex Elder34b13182012-07-13 20:35:12 -05002480 &dev_attr_snap_features.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002481 NULL,
2482};
2483
2484static struct attribute_group rbd_snap_attr_group = {
2485 .attrs = rbd_snap_attrs,
2486};
2487
2488static void rbd_snap_dev_release(struct device *dev)
2489{
2490 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2491 kfree(snap->name);
2492 kfree(snap);
2493}
2494
static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

/* Device type for snapshot sysfs entries; release frees the rbd_snap */
static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2504
Alex Elder8b8fb992012-10-26 17:25:24 -05002505static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2506{
2507 kref_get(&spec->kref);
2508
2509 return spec;
2510}
2511
2512static void rbd_spec_free(struct kref *kref);
2513static void rbd_spec_put(struct rbd_spec *spec)
2514{
2515 if (spec)
2516 kref_put(&spec->kref, rbd_spec_free);
2517}
2518
/*
 * Allocate a zeroed rbd_spec with a single (caller-owned)
 * reference.  Returns NULL on allocation failure.
 */
static struct rbd_spec *rbd_spec_alloc(void)
{
	struct rbd_spec *spec;

	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
	if (!spec)
		return NULL;
	kref_init(&spec->kref);

	/* No-op get/put pair; marked TEMPORARY by the author */
	rbd_spec_put(rbd_spec_get(spec));	/* TEMPORARY */

	return spec;
}
2532
2533static void rbd_spec_free(struct kref *kref)
2534{
2535 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2536
2537 kfree(spec->pool_name);
2538 kfree(spec->image_id);
2539 kfree(spec->image_name);
2540 kfree(spec->snap_name);
2541 kfree(spec);
2542}
2543
Alex Elderc53d5892012-10-25 23:34:42 -05002544struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2545 struct rbd_spec *spec)
2546{
2547 struct rbd_device *rbd_dev;
2548
2549 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2550 if (!rbd_dev)
2551 return NULL;
2552
2553 spin_lock_init(&rbd_dev->lock);
Alex Elderd78b6502012-11-09 08:43:15 -06002554 atomic_set(&rbd_dev->exists, 0);
Alex Elderc53d5892012-10-25 23:34:42 -05002555 INIT_LIST_HEAD(&rbd_dev->node);
2556 INIT_LIST_HEAD(&rbd_dev->snaps);
2557 init_rwsem(&rbd_dev->header_rwsem);
2558
2559 rbd_dev->spec = spec;
2560 rbd_dev->rbd_client = rbdc;
2561
Alex Elder0903e872012-11-14 12:25:19 -06002562 /* Initialize the layout used for all rbd requests */
2563
2564 rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2565 rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
2566 rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2567 rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
2568
Alex Elderc53d5892012-10-25 23:34:42 -05002569 return rbd_dev;
2570}
2571
/*
 * Tear down an rbd_device created by rbd_dev_create(): drop the
 * parent and own spec references, release the client, and free
 * the header object name and the device structure.
 */
static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	rbd_spec_put(rbd_dev->parent_spec);
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev);
}
2580
Alex Elder304f6802012-08-31 17:29:52 -05002581static bool rbd_snap_registered(struct rbd_snap *snap)
2582{
2583 bool ret = snap->dev.type == &rbd_snap_device_type;
2584 bool reg = device_is_registered(&snap->dev);
2585
2586 rbd_assert(!ret ^ reg);
2587
2588 return ret;
2589}
2590
Alex Elder41f38c22012-10-25 23:34:40 -05002591static void rbd_remove_snap_dev(struct rbd_snap *snap)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002592{
2593 list_del(&snap->node);
Alex Elder304f6802012-08-31 17:29:52 -05002594 if (device_is_registered(&snap->dev))
2595 device_unregister(&snap->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002596}
2597
Alex Elder14e70852012-07-19 09:09:27 -05002598static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002599 struct device *parent)
2600{
2601 struct device *dev = &snap->dev;
2602 int ret;
2603
2604 dev->type = &rbd_snap_device_type;
2605 dev->parent = parent;
2606 dev->release = rbd_snap_dev_release;
Alex Elderd4b125e2012-07-03 16:01:19 -05002607 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
Alex Elder304f6802012-08-31 17:29:52 -05002608 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2609
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002610 ret = device_register(dev);
2611
2612 return ret;
2613}
2614
Alex Elder4e891e02012-07-10 20:30:10 -05002615static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05002616 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05002617 u64 snap_id, u64 snap_size,
2618 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002619{
Alex Elder4e891e02012-07-10 20:30:10 -05002620 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002621 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002622
2623 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002624 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002625 return ERR_PTR(-ENOMEM);
2626
2627 ret = -ENOMEM;
Alex Elderc8d18422012-07-10 20:30:11 -05002628 snap->name = kstrdup(snap_name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002629 if (!snap->name)
2630 goto err;
2631
Alex Elderc8d18422012-07-10 20:30:11 -05002632 snap->id = snap_id;
2633 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05002634 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05002635
2636 return snap;
2637
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002638err:
2639 kfree(snap->name);
2640 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002641
2642 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002643}
2644
Alex Eldercd892122012-07-03 16:01:19 -05002645static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2646 u64 *snap_size, u64 *snap_features)
2647{
2648 char *snap_name;
2649
2650 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2651
2652 *snap_size = rbd_dev->header.snap_sizes[which];
2653 *snap_features = 0; /* No features for v1 */
2654
2655 /* Skip over names until we find the one we are looking for */
2656
2657 snap_name = rbd_dev->header.snap_names;
2658 while (which--)
2659 snap_name += strlen(snap_name) + 1;
2660
2661 return snap_name;
2662}
2663
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002664/*
Alex Elder9d475de2012-07-03 16:01:19 -05002665 * Get the size and object order for an image snapshot, or if
2666 * snap_id is CEPH_NOSNAP, gets this information for the base
2667 * image.
2668 */
2669static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
2670 u8 *order, u64 *snap_size)
2671{
2672 __le64 snapid = cpu_to_le64(snap_id);
2673 int ret;
2674 struct {
2675 u8 order;
2676 __le64 size;
2677 } __attribute__ ((packed)) size_buf = { 0 };
2678
Alex Elder36be9a72013-01-19 00:30:28 -06002679 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder9d475de2012-07-03 16:01:19 -05002680 "rbd", "get_size",
2681 (char *) &snapid, sizeof (snapid),
Alex Elder07b23912012-11-09 08:43:16 -06002682 (char *) &size_buf, sizeof (size_buf), NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06002683 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder9d475de2012-07-03 16:01:19 -05002684 if (ret < 0)
2685 return ret;
2686
2687 *order = size_buf.order;
2688 *snap_size = le64_to_cpu(size_buf.size);
2689
2690 dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
2691 (unsigned long long) snap_id, (unsigned int) *order,
2692 (unsigned long long) *snap_size);
2693
2694 return 0;
2695}
2696
/*
 * Fetch the base image's size and object order into the header.
 * Thin wrapper around _rbd_dev_v2_snap_size() with CEPH_NOSNAP.
 */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
2703
Alex Elder1e130192012-07-03 16:01:19 -05002704static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2705{
2706 void *reply_buf;
2707 int ret;
2708 void *p;
2709
2710 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
2711 if (!reply_buf)
2712 return -ENOMEM;
2713
Alex Elder36be9a72013-01-19 00:30:28 -06002714 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder1e130192012-07-03 16:01:19 -05002715 "rbd", "get_object_prefix",
2716 NULL, 0,
Alex Elder07b23912012-11-09 08:43:16 -06002717 reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06002718 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder1e130192012-07-03 16:01:19 -05002719 if (ret < 0)
2720 goto out;
Alex Elder36be9a72013-01-19 00:30:28 -06002721 ret = 0; /* rbd_obj_method_sync() can return positive */
Alex Elder1e130192012-07-03 16:01:19 -05002722
2723 p = reply_buf;
2724 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
2725 p + RBD_OBJ_PREFIX_LEN_MAX,
2726 NULL, GFP_NOIO);
2727
2728 if (IS_ERR(rbd_dev->header.object_prefix)) {
2729 ret = PTR_ERR(rbd_dev->header.object_prefix);
2730 rbd_dev->header.object_prefix = NULL;
2731 } else {
2732 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
2733 }
2734
2735out:
2736 kfree(reply_buf);
2737
2738 return ret;
2739}
2740
Alex Elderb1b54022012-07-03 16:01:19 -05002741static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2742 u64 *snap_features)
2743{
2744 __le64 snapid = cpu_to_le64(snap_id);
2745 struct {
2746 __le64 features;
2747 __le64 incompat;
2748 } features_buf = { 0 };
Alex Elderd8891402012-10-09 13:50:17 -07002749 u64 incompat;
Alex Elderb1b54022012-07-03 16:01:19 -05002750 int ret;
2751
Alex Elder36be9a72013-01-19 00:30:28 -06002752 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elderb1b54022012-07-03 16:01:19 -05002753 "rbd", "get_features",
2754 (char *) &snapid, sizeof (snapid),
2755 (char *) &features_buf, sizeof (features_buf),
Alex Elder07b23912012-11-09 08:43:16 -06002756 NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06002757 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderb1b54022012-07-03 16:01:19 -05002758 if (ret < 0)
2759 return ret;
Alex Elderd8891402012-10-09 13:50:17 -07002760
2761 incompat = le64_to_cpu(features_buf.incompat);
2762 if (incompat & ~RBD_FEATURES_ALL)
Alex Elderb8f5c6e2012-11-01 08:39:26 -05002763 return -ENXIO;
Alex Elderd8891402012-10-09 13:50:17 -07002764
Alex Elderb1b54022012-07-03 16:01:19 -05002765 *snap_features = le64_to_cpu(features_buf.features);
2766
2767 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2768 (unsigned long long) snap_id,
2769 (unsigned long long) *snap_features,
2770 (unsigned long long) le64_to_cpu(features_buf.incompat));
2771
2772 return 0;
2773}
2774
/*
 * Fetch the base image's feature mask into the header.  Thin
 * wrapper around _rbd_dev_v2_snap_features() with CEPH_NOSNAP.
 */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}
2780
Alex Elder86b00e02012-10-25 23:34:42 -05002781static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
2782{
2783 struct rbd_spec *parent_spec;
2784 size_t size;
2785 void *reply_buf = NULL;
2786 __le64 snapid;
2787 void *p;
2788 void *end;
2789 char *image_id;
2790 u64 overlap;
Alex Elder86b00e02012-10-25 23:34:42 -05002791 int ret;
2792
2793 parent_spec = rbd_spec_alloc();
2794 if (!parent_spec)
2795 return -ENOMEM;
2796
2797 size = sizeof (__le64) + /* pool_id */
2798 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
2799 sizeof (__le64) + /* snap_id */
2800 sizeof (__le64); /* overlap */
2801 reply_buf = kmalloc(size, GFP_KERNEL);
2802 if (!reply_buf) {
2803 ret = -ENOMEM;
2804 goto out_err;
2805 }
2806
2807 snapid = cpu_to_le64(CEPH_NOSNAP);
Alex Elder36be9a72013-01-19 00:30:28 -06002808 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder86b00e02012-10-25 23:34:42 -05002809 "rbd", "get_parent",
2810 (char *) &snapid, sizeof (snapid),
Alex Elder07b23912012-11-09 08:43:16 -06002811 (char *) reply_buf, size, NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06002812 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder86b00e02012-10-25 23:34:42 -05002813 if (ret < 0)
2814 goto out_err;
2815
2816 ret = -ERANGE;
2817 p = reply_buf;
2818 end = (char *) reply_buf + size;
2819 ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
2820 if (parent_spec->pool_id == CEPH_NOPOOL)
2821 goto out; /* No parent? No problem. */
2822
Alex Elder0903e872012-11-14 12:25:19 -06002823 /* The ceph file layout needs to fit pool id in 32 bits */
2824
2825 ret = -EIO;
2826 if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
2827 goto out;
2828
Alex Elder979ed482012-11-01 08:39:26 -05002829 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elder86b00e02012-10-25 23:34:42 -05002830 if (IS_ERR(image_id)) {
2831 ret = PTR_ERR(image_id);
2832 goto out_err;
2833 }
2834 parent_spec->image_id = image_id;
2835 ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
2836 ceph_decode_64_safe(&p, end, overlap, out_err);
2837
2838 rbd_dev->parent_overlap = overlap;
2839 rbd_dev->parent_spec = parent_spec;
2840 parent_spec = NULL; /* rbd_dev now owns this */
2841out:
2842 ret = 0;
2843out_err:
2844 kfree(reply_buf);
2845 rbd_spec_put(parent_spec);
2846
2847 return ret;
2848}
2849
Alex Elder9e15b772012-10-30 19:40:33 -05002850static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
2851{
2852 size_t image_id_size;
2853 char *image_id;
2854 void *p;
2855 void *end;
2856 size_t size;
2857 void *reply_buf = NULL;
2858 size_t len = 0;
2859 char *image_name = NULL;
2860 int ret;
2861
2862 rbd_assert(!rbd_dev->spec->image_name);
2863
Alex Elder69e7a022012-11-01 08:39:26 -05002864 len = strlen(rbd_dev->spec->image_id);
2865 image_id_size = sizeof (__le32) + len;
Alex Elder9e15b772012-10-30 19:40:33 -05002866 image_id = kmalloc(image_id_size, GFP_KERNEL);
2867 if (!image_id)
2868 return NULL;
2869
2870 p = image_id;
2871 end = (char *) image_id + image_id_size;
Alex Elder69e7a022012-11-01 08:39:26 -05002872 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
Alex Elder9e15b772012-10-30 19:40:33 -05002873
2874 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
2875 reply_buf = kmalloc(size, GFP_KERNEL);
2876 if (!reply_buf)
2877 goto out;
2878
Alex Elder36be9a72013-01-19 00:30:28 -06002879 ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
Alex Elder9e15b772012-10-30 19:40:33 -05002880 "rbd", "dir_get_name",
2881 image_id, image_id_size,
Alex Elder07b23912012-11-09 08:43:16 -06002882 (char *) reply_buf, size, NULL);
Alex Elder9e15b772012-10-30 19:40:33 -05002883 if (ret < 0)
2884 goto out;
2885 p = reply_buf;
2886 end = (char *) reply_buf + size;
2887 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
2888 if (IS_ERR(image_name))
2889 image_name = NULL;
2890 else
2891 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
2892out:
2893 kfree(reply_buf);
2894 kfree(image_id);
2895
2896 return image_name;
2897}
2898
2899/*
2900 * When a parent image gets probed, we only have the pool, image,
2901 * and snapshot ids but not the names of any of them. This call
2902 * is made later to fill in those names. It has to be done after
2903 * rbd_dev_snaps_update() has completed because some of the
2904 * information (in particular, snapshot name) is not available
2905 * until then.
2906 */
2907static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
2908{
2909 struct ceph_osd_client *osdc;
2910 const char *name;
2911 void *reply_buf = NULL;
2912 int ret;
2913
2914 if (rbd_dev->spec->pool_name)
2915 return 0; /* Already have the names */
2916
2917 /* Look up the pool name */
2918
2919 osdc = &rbd_dev->rbd_client->client->osdc;
2920 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05002921 if (!name) {
2922 rbd_warn(rbd_dev, "there is no pool with id %llu",
2923 rbd_dev->spec->pool_id); /* Really a BUG() */
2924 return -EIO;
2925 }
Alex Elder9e15b772012-10-30 19:40:33 -05002926
2927 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
2928 if (!rbd_dev->spec->pool_name)
2929 return -ENOMEM;
2930
2931 /* Fetch the image name; tolerate failure here */
2932
2933 name = rbd_dev_image_name(rbd_dev);
Alex Elder69e7a022012-11-01 08:39:26 -05002934 if (name)
Alex Elder9e15b772012-10-30 19:40:33 -05002935 rbd_dev->spec->image_name = (char *) name;
Alex Elder69e7a022012-11-01 08:39:26 -05002936 else
Alex Elder06ecc6c2012-11-01 10:17:15 -05002937 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05002938
2939 /* Look up the snapshot name. */
2940
2941 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
2942 if (!name) {
Alex Elder935dc892012-11-01 10:17:15 -05002943 rbd_warn(rbd_dev, "no snapshot with id %llu",
2944 rbd_dev->spec->snap_id); /* Really a BUG() */
Alex Elder9e15b772012-10-30 19:40:33 -05002945 ret = -EIO;
2946 goto out_err;
2947 }
2948 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
2949 if(!rbd_dev->spec->snap_name)
2950 goto out_err;
2951
2952 return 0;
2953out_err:
2954 kfree(reply_buf);
2955 kfree(rbd_dev->spec->pool_name);
2956 rbd_dev->spec->pool_name = NULL;
2957
2958 return ret;
2959}
2960
Alex Elder6e14b1a2012-07-03 16:01:19 -05002961static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05002962{
2963 size_t size;
2964 int ret;
2965 void *reply_buf;
2966 void *p;
2967 void *end;
2968 u64 seq;
2969 u32 snap_count;
2970 struct ceph_snap_context *snapc;
2971 u32 i;
2972
2973 /*
2974 * We'll need room for the seq value (maximum snapshot id),
2975 * snapshot count, and array of that many snapshot ids.
2976 * For now we have a fixed upper limit on the number we're
2977 * prepared to receive.
2978 */
2979 size = sizeof (__le64) + sizeof (__le32) +
2980 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2981 reply_buf = kzalloc(size, GFP_KERNEL);
2982 if (!reply_buf)
2983 return -ENOMEM;
2984
Alex Elder36be9a72013-01-19 00:30:28 -06002985 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder35d489f2012-07-03 16:01:19 -05002986 "rbd", "get_snapcontext",
2987 NULL, 0,
Alex Elder07b23912012-11-09 08:43:16 -06002988 reply_buf, size, ver);
Alex Elder36be9a72013-01-19 00:30:28 -06002989 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder35d489f2012-07-03 16:01:19 -05002990 if (ret < 0)
2991 goto out;
2992
2993 ret = -ERANGE;
2994 p = reply_buf;
2995 end = (char *) reply_buf + size;
2996 ceph_decode_64_safe(&p, end, seq, out);
2997 ceph_decode_32_safe(&p, end, snap_count, out);
2998
2999 /*
3000 * Make sure the reported number of snapshot ids wouldn't go
3001 * beyond the end of our buffer. But before checking that,
3002 * make sure the computed size of the snapshot context we
3003 * allocate is representable in a size_t.
3004 */
3005 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
3006 / sizeof (u64)) {
3007 ret = -EINVAL;
3008 goto out;
3009 }
3010 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3011 goto out;
3012
3013 size = sizeof (struct ceph_snap_context) +
3014 snap_count * sizeof (snapc->snaps[0]);
3015 snapc = kmalloc(size, GFP_KERNEL);
3016 if (!snapc) {
3017 ret = -ENOMEM;
3018 goto out;
3019 }
3020
3021 atomic_set(&snapc->nref, 1);
3022 snapc->seq = seq;
3023 snapc->num_snaps = snap_count;
3024 for (i = 0; i < snap_count; i++)
3025 snapc->snaps[i] = ceph_decode_64(&p);
3026
3027 rbd_dev->header.snapc = snapc;
3028
3029 dout(" snap context seq = %llu, snap_count = %u\n",
3030 (unsigned long long) seq, (unsigned int) snap_count);
3031
3032out:
3033 kfree(reply_buf);
3034
3035 return 0;
3036}
3037
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003038static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
3039{
3040 size_t size;
3041 void *reply_buf;
3042 __le64 snap_id;
3043 int ret;
3044 void *p;
3045 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003046 char *snap_name;
3047
3048 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
3049 reply_buf = kmalloc(size, GFP_KERNEL);
3050 if (!reply_buf)
3051 return ERR_PTR(-ENOMEM);
3052
3053 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
Alex Elder36be9a72013-01-19 00:30:28 -06003054 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003055 "rbd", "get_snapshot_name",
3056 (char *) &snap_id, sizeof (snap_id),
Alex Elder07b23912012-11-09 08:43:16 -06003057 reply_buf, size, NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06003058 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003059 if (ret < 0)
3060 goto out;
3061
3062 p = reply_buf;
3063 end = (char *) reply_buf + size;
Alex Eldere5c35532012-10-25 23:34:41 -05003064 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003065 if (IS_ERR(snap_name)) {
3066 ret = PTR_ERR(snap_name);
3067 goto out;
3068 } else {
3069 dout(" snap_id 0x%016llx snap_name = %s\n",
3070 (unsigned long long) le64_to_cpu(snap_id), snap_name);
3071 }
3072 kfree(reply_buf);
3073
3074 return snap_name;
3075out:
3076 kfree(reply_buf);
3077
3078 return ERR_PTR(ret);
3079}
3080
3081static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3082 u64 *snap_size, u64 *snap_features)
3083{
Alex Eldere0b49862013-01-09 14:44:18 -06003084 u64 snap_id;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003085 u8 order;
3086 int ret;
3087
3088 snap_id = rbd_dev->header.snapc->snaps[which];
3089 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
3090 if (ret)
3091 return ERR_PTR(ret);
3092 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
3093 if (ret)
3094 return ERR_PTR(ret);
3095
3096 return rbd_dev_v2_snap_name(rbd_dev, which);
3097}
3098
3099static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3100 u64 *snap_size, u64 *snap_features)
3101{
3102 if (rbd_dev->image_format == 1)
3103 return rbd_dev_v1_snap_info(rbd_dev, which,
3104 snap_size, snap_features);
3105 if (rbd_dev->image_format == 2)
3106 return rbd_dev_v2_snap_info(rbd_dev, which,
3107 snap_size, snap_features);
3108 return ERR_PTR(-EINVAL);
3109}
3110
Alex Elder117973f2012-08-31 17:29:55 -05003111static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3112{
3113 int ret;
3114 __u8 obj_order;
3115
3116 down_write(&rbd_dev->header_rwsem);
3117
3118 /* Grab old order first, to see if it changes */
3119
3120 obj_order = rbd_dev->header.obj_order,
3121 ret = rbd_dev_v2_image_size(rbd_dev);
3122 if (ret)
3123 goto out;
3124 if (rbd_dev->header.obj_order != obj_order) {
3125 ret = -EIO;
3126 goto out;
3127 }
3128 rbd_update_mapping_size(rbd_dev);
3129
3130 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
3131 dout("rbd_dev_v2_snap_context returned %d\n", ret);
3132 if (ret)
3133 goto out;
3134 ret = rbd_dev_snaps_update(rbd_dev);
3135 dout("rbd_dev_snaps_update returned %d\n", ret);
3136 if (ret)
3137 goto out;
3138 ret = rbd_dev_snaps_register(rbd_dev);
3139 dout("rbd_dev_snaps_register returned %d\n", ret);
3140out:
3141 up_write(&rbd_dev->header_rwsem);
3142
3143 return ret;
3144}
3145
Alex Elder9d475de2012-07-03 16:01:19 -05003146/*
Alex Elder35938152012-08-02 11:29:46 -05003147 * Scan the rbd device's current snapshot list and compare it to the
3148 * newly-received snapshot context. Remove any existing snapshots
3149 * not present in the new snapshot context. Add a new snapshot for
3150 * any snaphots in the snapshot context not in the current list.
3151 * And verify there are no changes to snapshots we already know
3152 * about.
3153 *
3154 * Assumes the snapshots in the snapshot context are sorted by
3155 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
3156 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003157 */
Alex Elder304f6802012-08-31 17:29:52 -05003158static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003159{
Alex Elder35938152012-08-02 11:29:46 -05003160 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
3161 const u32 snap_count = snapc->num_snaps;
Alex Elder35938152012-08-02 11:29:46 -05003162 struct list_head *head = &rbd_dev->snaps;
3163 struct list_head *links = head->next;
3164 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003165
Alex Elder9fcbb802012-08-23 23:48:49 -05003166 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
Alex Elder35938152012-08-02 11:29:46 -05003167 while (index < snap_count || links != head) {
3168 u64 snap_id;
3169 struct rbd_snap *snap;
Alex Eldercd892122012-07-03 16:01:19 -05003170 char *snap_name;
3171 u64 snap_size = 0;
3172 u64 snap_features = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003173
Alex Elder35938152012-08-02 11:29:46 -05003174 snap_id = index < snap_count ? snapc->snaps[index]
3175 : CEPH_NOSNAP;
3176 snap = links != head ? list_entry(links, struct rbd_snap, node)
3177 : NULL;
Alex Elderaafb230e2012-09-06 16:00:54 -05003178 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003179
Alex Elder35938152012-08-02 11:29:46 -05003180 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
3181 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003182
Alex Elder35938152012-08-02 11:29:46 -05003183 /* Existing snapshot not in the new snap context */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003184
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003185 if (rbd_dev->spec->snap_id == snap->id)
Alex Elderd78b6502012-11-09 08:43:15 -06003186 atomic_set(&rbd_dev->exists, 0);
Alex Elder41f38c22012-10-25 23:34:40 -05003187 rbd_remove_snap_dev(snap);
Alex Elder9fcbb802012-08-23 23:48:49 -05003188 dout("%ssnap id %llu has been removed\n",
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003189 rbd_dev->spec->snap_id == snap->id ?
3190 "mapped " : "",
Alex Elder9fcbb802012-08-23 23:48:49 -05003191 (unsigned long long) snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003192
Alex Elder35938152012-08-02 11:29:46 -05003193 /* Done with this list entry; advance */
3194
3195 links = next;
3196 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003197 }
Alex Elder35938152012-08-02 11:29:46 -05003198
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003199 snap_name = rbd_dev_snap_info(rbd_dev, index,
3200 &snap_size, &snap_features);
Alex Eldercd892122012-07-03 16:01:19 -05003201 if (IS_ERR(snap_name))
3202 return PTR_ERR(snap_name);
3203
Alex Elder9fcbb802012-08-23 23:48:49 -05003204 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
3205 (unsigned long long) snap_id);
Alex Elder35938152012-08-02 11:29:46 -05003206 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
3207 struct rbd_snap *new_snap;
3208
3209 /* We haven't seen this snapshot before */
3210
Alex Elderc8d18422012-07-10 20:30:11 -05003211 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
Alex Eldercd892122012-07-03 16:01:19 -05003212 snap_id, snap_size, snap_features);
Alex Elder9fcbb802012-08-23 23:48:49 -05003213 if (IS_ERR(new_snap)) {
3214 int err = PTR_ERR(new_snap);
3215
3216 dout(" failed to add dev, error %d\n", err);
3217
3218 return err;
3219 }
Alex Elder35938152012-08-02 11:29:46 -05003220
3221 /* New goes before existing, or at end of list */
3222
Alex Elder9fcbb802012-08-23 23:48:49 -05003223 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05003224 if (snap)
3225 list_add_tail(&new_snap->node, &snap->node);
3226 else
Alex Elder523f3252012-08-30 00:16:37 -05003227 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05003228 } else {
3229 /* Already have this one */
3230
Alex Elder9fcbb802012-08-23 23:48:49 -05003231 dout(" already present\n");
3232
Alex Eldercd892122012-07-03 16:01:19 -05003233 rbd_assert(snap->size == snap_size);
Alex Elderaafb230e2012-09-06 16:00:54 -05003234 rbd_assert(!strcmp(snap->name, snap_name));
Alex Eldercd892122012-07-03 16:01:19 -05003235 rbd_assert(snap->features == snap_features);
Alex Elder35938152012-08-02 11:29:46 -05003236
3237 /* Done with this list entry; advance */
3238
3239 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003240 }
Alex Elder35938152012-08-02 11:29:46 -05003241
3242 /* Advance to the next entry in the snapshot context */
3243
3244 index++;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003245 }
Alex Elder9fcbb802012-08-23 23:48:49 -05003246 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003247
3248 return 0;
3249}
3250
Alex Elder304f6802012-08-31 17:29:52 -05003251/*
3252 * Scan the list of snapshots and register the devices for any that
3253 * have not already been registered.
3254 */
3255static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
3256{
3257 struct rbd_snap *snap;
3258 int ret = 0;
3259
3260 dout("%s called\n", __func__);
Alex Elder86ff77b2012-08-31 17:29:53 -05003261 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
3262 return -EIO;
Alex Elder304f6802012-08-31 17:29:52 -05003263
3264 list_for_each_entry(snap, &rbd_dev->snaps, node) {
3265 if (!rbd_snap_registered(snap)) {
3266 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
3267 if (ret < 0)
3268 break;
3269 }
3270 }
3271 dout("%s: returning %d\n", __func__, ret);
3272
3273 return ret;
3274}
3275
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003276static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3277{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003278 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05003279 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003280
3281 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003282
Alex Eldercd789ab2012-08-30 00:16:38 -05003283 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003284 dev->bus = &rbd_bus_type;
3285 dev->type = &rbd_device_type;
3286 dev->parent = &rbd_root_dev;
3287 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05003288 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003289 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003290
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003291 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05003292
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003293 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003294}
3295
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003296static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
3297{
3298 device_unregister(&rbd_dev->dev);
3299}
3300
Alex Eldere2839302012-08-29 17:11:06 -05003301static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06003302
3303/*
Alex Elder499afd52012-02-02 08:13:29 -06003304 * Get a unique rbd identifier for the given new rbd_dev, and add
3305 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06003306 */
Alex Eldere2839302012-08-29 17:11:06 -05003307static void rbd_dev_id_get(struct rbd_device *rbd_dev)
Alex Elderb7f23c32012-01-29 13:57:43 -06003308{
Alex Eldere2839302012-08-29 17:11:06 -05003309 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
Alex Elder499afd52012-02-02 08:13:29 -06003310
3311 spin_lock(&rbd_dev_list_lock);
3312 list_add_tail(&rbd_dev->node, &rbd_dev_list);
3313 spin_unlock(&rbd_dev_list_lock);
Alex Eldere2839302012-08-29 17:11:06 -05003314 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3315 (unsigned long long) rbd_dev->dev_id);
Alex Elder1ddbe942012-01-29 13:57:44 -06003316}
Alex Elderb7f23c32012-01-29 13:57:43 -06003317
Alex Elder1ddbe942012-01-29 13:57:44 -06003318/*
Alex Elder499afd52012-02-02 08:13:29 -06003319 * Remove an rbd_dev from the global list, and record that its
3320 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06003321 */
Alex Eldere2839302012-08-29 17:11:06 -05003322static void rbd_dev_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06003323{
Alex Elderd184f6b2012-01-29 13:57:44 -06003324 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05003325 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06003326 int max_id;
3327
Alex Elderaafb230e2012-09-06 16:00:54 -05003328 rbd_assert(rbd_id > 0);
Alex Elder499afd52012-02-02 08:13:29 -06003329
Alex Eldere2839302012-08-29 17:11:06 -05003330 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3331 (unsigned long long) rbd_dev->dev_id);
Alex Elder499afd52012-02-02 08:13:29 -06003332 spin_lock(&rbd_dev_list_lock);
3333 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06003334
3335 /*
3336 * If the id being "put" is not the current maximum, there
3337 * is nothing special we need to do.
3338 */
Alex Eldere2839302012-08-29 17:11:06 -05003339 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
Alex Elderd184f6b2012-01-29 13:57:44 -06003340 spin_unlock(&rbd_dev_list_lock);
3341 return;
3342 }
3343
3344 /*
3345 * We need to update the current maximum id. Search the
3346 * list to find out what it is. We're more likely to find
3347 * the maximum at the end, so search the list backward.
3348 */
3349 max_id = 0;
3350 list_for_each_prev(tmp, &rbd_dev_list) {
3351 struct rbd_device *rbd_dev;
3352
3353 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderb213e0b2012-10-10 21:19:13 -07003354 if (rbd_dev->dev_id > max_id)
3355 max_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06003356 }
Alex Elder499afd52012-02-02 08:13:29 -06003357 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06003358
Alex Elder1ddbe942012-01-29 13:57:44 -06003359 /*
Alex Eldere2839302012-08-29 17:11:06 -05003360 * The max id could have been updated by rbd_dev_id_get(), in
Alex Elderd184f6b2012-01-29 13:57:44 -06003361 * which case it now accurately reflects the new maximum.
3362 * Be careful not to overwrite the maximum value in that
3363 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06003364 */
Alex Eldere2839302012-08-29 17:11:06 -05003365 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3366 dout(" max dev id has been reset\n");
Alex Elderb7f23c32012-01-29 13:57:43 -06003367}
3368
/*
 * Advance *buf past any leading white space and return the length
 * of the token (maximal run of non-space characters) that follows.
 * A return of 0 means *buf held nothing but white space.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters for which isspace() is nonzero
	 * in the "C" and "POSIX" locales.
	 */
	static const char delims[] = " \f\n\r\t\v";
	const char *p = *buf;

	p += strspn(p, delims);		/* skip to start of token */
	*buf = p;

	return strcspn(p, delims);	/* length of token at p */
}
3387
3388/*
3389 * Finds the next token in *buf, and if the provided token buffer is
3390 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06003391 * copied, is guaranteed to be terminated with '\0'. Note that *buf
3392 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06003393 *
3394 * Returns the length of the token found (not including the '\0').
3395 * Return value will be 0 if no token is found, and it will be >=
3396 * token_size if the token would not fit.
3397 *
Alex Elder593a9e72012-02-07 12:03:37 -06003398 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06003399 * found token. Note that this occurs even if the token buffer is
3400 * too small to hold it.
3401 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	/*
	 * Only copy when the token plus its '\0' terminator fits;
	 * either way *buf is advanced past the token, and the
	 * caller can detect truncation from len >= token_size.
	 */
	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
3417
3418/*
Alex Elderea3352f2012-07-09 21:04:23 -05003419 * Finds the next token in *buf, dynamically allocates a buffer big
3420 * enough to hold a copy of it, and copies the token into the new
3421 * buffer. The copy is guaranteed to be terminated with '\0'. Note
3422 * that a duplicate buffer is created even for a zero-length token.
3423 *
3424 * Returns a pointer to the newly-allocated duplicate, or a null
3425 * pointer if memory for the duplicate was not available. If
3426 * the lenp argument is a non-null pointer, the length of the token
3427 * (not including the '\0') is returned in *lenp.
3428 *
3429 * If successful, the *buf pointer will be updated to point beyond
3430 * the end of the found token.
3431 *
3432 * Note: uses GFP_KERNEL for allocation.
3433 */
3434static inline char *dup_token(const char **buf, size_t *lenp)
3435{
3436 char *dup;
3437 size_t len;
3438
3439 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05003440 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05003441 if (!dup)
3442 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05003443 *(dup + len) = '\0';
3444 *buf += len;
3445
3446 if (lenp)
3447 *lenp = len;
3448
3449 return dup;
3450}
3451
3452/*
Alex Elder859c31d2012-10-25 23:34:42 -05003453 * Parse the options provided for an "rbd add" (i.e., rbd image
3454 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
3455 * and the data written is passed here via a NUL-terminated buffer.
3456 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05003457 *
Alex Elder859c31d2012-10-25 23:34:42 -05003458 * The information extracted from these options is recorded in
3459 * the other parameters which return dynamically-allocated
3460 * structures:
3461 * ceph_opts
3462 * The address of a pointer that will refer to a ceph options
3463 * structure. Caller must release the returned pointer using
3464 * ceph_destroy_options() when it is no longer needed.
3465 * rbd_opts
3466 * Address of an rbd options pointer. Fully initialized by
3467 * this function; caller must release with kfree().
3468 * spec
3469 * Address of an rbd image specification pointer. Fully
3470 * initialized by this function based on parsed options.
3471 * Caller must release with rbd_spec_put().
3472 *
3473 * The options passed take this form:
3474 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3475 * where:
3476 * <mon_addrs>
3477 * A comma-separated list of one or more monitor addresses.
3478 * A monitor address is an ip address, optionally followed
3479 * by a port number (separated by a colon).
3480 * I.e.: ip1[:port1][,ip2[:port2]...]
3481 * <options>
3482 * A comma-separated list of ceph and/or rbd options.
3483 * <pool_name>
3484 * The name of the rados pool containing the rbd image.
3485 * <image_name>
3486 * The name of the image in that pool to map.
3487 * <snap_id>
3488 * An optional snapshot id. If provided, the mapping will
3489 * present data from the image at the time that snapshot was
3490 * created. The image head is used if no snapshot id is
3491 * provided. Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06003492 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	size_t mon_addrs_size;
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	/*
	 * mon_addrs is not NUL-terminated here; mon_addrs_size (one
	 * more than the token length) is used below to compute the
	 * end pointer handed to ceph_parse_options().
	 */
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	/* Default errno for the empty-token checks that follow */
	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	spec->image_name = dup_token(&buf, NULL);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	/*
	 * kmemdup() copies one byte past the token; it is replaced
	 * with the terminating '\0' just below.
	 */
	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!spec->snap_name)
		goto out_mem;
	*(spec->snap_name + len) = '\0';

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	/* The options string has served its purpose */
	kfree(options);

	/* Success: hand all three allocated results to the caller */
	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	/* Unallocated pointers are NULL here; both calls accept that */
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}
3595
Alex Elder589d30e2012-07-10 20:30:11 -05003596/*
3597 * An rbd format 2 image has a unique identifier, distinct from the
3598 * name given to it by the user. Internally, that identifier is
3599 * what's used to specify the names of objects related to the image.
3600 *
3601 * A special "rbd id" object is used to map an rbd image name to its
3602 * id. If that object doesn't exist, then there is no v2 rbd image
3603 * with the supplied name.
3604 *
3605 * This function will record the given rbd_dev's image_id field if
3606 * it can be determined, and in that case will return 0. If any
3607 * errors occur a negative errno will be returned and the rbd_dev's
3608 * image_id field will be unchanged (and should be NULL).
3609 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.
	 */
	if (rbd_dev->spec->image_id)
		return 0;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	/* sizeof () includes the trailing NUL needed by sprintf() */
	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* Ask the "rbd" class method "get_id" for the image id */
	ret = rbd_obj_method_sync(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX, NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_obj_method_sync() can return positive */

	/*
	 * Decode the id string from the response buffer; on failure
	 * an ERR_PTR is returned (checked below) and image_id is
	 * reset to NULL as the function contract requires.
	 */
	p = response;
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						NULL, GFP_NOIO);
	if (IS_ERR(rbd_dev->spec->image_id)) {
		ret = PTR_ERR(rbd_dev->spec->image_id);
		rbd_dev->spec->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
3671
/*
 * Probe an image assumed to be in the original (format 1) layout:
 * set an empty image id, build the header object name from the
 * image name, and read the on-disk header.  On failure, any fields
 * set here are freed and reset to NULL before the error returns.
 */
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->spec->image_id)
		return -ENOMEM;

	/* Record the header object name for this rbd image. */

	/* sizeof (RBD_SUFFIX) also covers the terminating NUL */
	size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	sprintf(rbd_dev->header_name, "%s%s",
		rbd_dev->spec->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;

	/* Version 1 images have no parent (no layering) */

	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;

	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;

	return ret;
}
3720
/*
 * Probe a format 2 image.  The image id was already determined by
 * the caller; build the header object name from it, then fetch the
 * image's size/order, object prefix, features, parent info (if the
 * layering feature is set), and snapshot context.  On any failure
 * everything gathered here is released before the error returns.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	/* Undo everything above; unset pointers are NULL and safe here */
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3793
/*
 * Complete the probe of an rbd device whose image metadata has
 * already been read: update snapshots, assign a device id, register
 * the block device and sysfs entries, start watching the header
 * object, and finally announce the disk.  Each failure point
 * unwinds exactly the steps completed before it (note that once
 * rbd_bus_add_dev() has succeeded, cleanup is delegated to the
 * sysfs code via rbd_bus_del_dev()).
 */
static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_probe_update_spec(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	/* register_blkdev(0, ...) dynamically allocates a major number */
	ret = register_blkdev(0, rbd_dev->name);
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	ret = rbd_dev_header_watch_sync(rbd_dev, 1);
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;
err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}
3875
Alex Eldera30b71b2012-07-10 20:30:11 -05003876/*
3877 * Probe for the existence of the header object for the given rbd
3878 * device. For format 2 images this includes determining the image
3879 * id.
3880 */
3881static int rbd_dev_probe(struct rbd_device *rbd_dev)
3882{
3883 int ret;
3884
3885 /*
3886 * Get the id from the image id object. If it's not a
3887 * format 2 image, we'll get ENOENT back, and we'll assume
3888 * it's a format 1 image.
3889 */
3890 ret = rbd_dev_image_id(rbd_dev);
3891 if (ret)
3892 ret = rbd_dev_v1_probe(rbd_dev);
3893 else
3894 ret = rbd_dev_v2_probe(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05003895 if (ret) {
Alex Eldera30b71b2012-07-10 20:30:11 -05003896 dout("probe failed, returning %d\n", ret);
3897
Alex Elder83a06262012-10-30 15:47:17 -05003898 return ret;
3899 }
3900
3901 ret = rbd_dev_probe_finish(rbd_dev);
3902 if (ret)
3903 rbd_header_free(&rbd_dev->header);
3904
Alex Eldera30b71b2012-07-10 20:30:11 -05003905 return ret;
3906}
3907
/*
 * Handle a write to /sys/bus/rbd/add: parse the mapping request in
 * buf, connect to the cluster, resolve the pool, create the
 * rbd_device, and probe/activate it.  Returns count on success or
 * a negative errno.  Ownership of ceph_opts, rbdc and spec is
 * handed off step by step (each pointer is NULLed once owned by
 * the next object), so the error ladder frees only what is still
 * owned here.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto err_out_module;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}
	ceph_opts = NULL;	/* rbd_dev client now owns this */

	/* pick the pool */
	osdc = &rbdc->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
	if (rc < 0)
		goto err_out_client;
	spec->pool_id = (u64) rc;

	/* The ceph file layout needs to fit pool id in 32 bits */

	if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
		rc = -EIO;
		goto err_out_client;
	}

	rbd_dev = rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		goto err_out_client;
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */

	rbd_dev->mapping.read_only = rbd_opts->read_only;
	kfree(rbd_opts);
	rbd_opts = NULL;	/* done with this */

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_rbd_dev;

	return count;
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	/* rbdc may be NULL here; rbd_put_client() must accept that */
	rbd_put_client(rbdc);
err_out_args:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_opts);
	rbd_spec_put(spec);
err_out_module:
	module_put(THIS_MODULE);

	/* Only error paths reach this point; success returned above */
	dout("Error adding device %s\n", buf);

	return (ssize_t) rc;
}
3980
Alex Elderde71a292012-07-03 16:01:19 -05003981static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003982{
3983 struct list_head *tmp;
3984 struct rbd_device *rbd_dev;
3985
Alex Eldere124a822012-01-29 13:57:44 -06003986 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003987 list_for_each(tmp, &rbd_dev_list) {
3988 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05003989 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06003990 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003991 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06003992 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003993 }
Alex Eldere124a822012-01-29 13:57:44 -06003994 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003995 return NULL;
3996}
3997
/*
 * Release callback for an rbd device's embedded struct device:
 * stop watching the header object (if a watch was set up), tear
 * down the block device, free the header fields, return the device
 * id, destroy the rbd_dev, and drop the module reference taken in
 * rbd_add().
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->watch_event)
		rbd_dev_header_watch_sync(rbd_dev, 0);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
4020
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004021static ssize_t rbd_remove(struct bus_type *bus,
4022 const char *buf,
4023 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004024{
4025 struct rbd_device *rbd_dev = NULL;
4026 int target_id, rc;
4027 unsigned long ul;
4028 int ret = count;
4029
4030 rc = strict_strtoul(buf, 10, &ul);
4031 if (rc)
4032 return rc;
4033
4034 /* convert to int; abort if we lost anything in the conversion */
4035 target_id = (int) ul;
4036 if (target_id != ul)
4037 return -EINVAL;
4038
4039 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4040
4041 rbd_dev = __rbd_get_dev(target_id);
4042 if (!rbd_dev) {
4043 ret = -ENOENT;
4044 goto done;
4045 }
4046
Alex Elder42382b72012-11-16 09:29:16 -06004047 if (rbd_dev->open_count) {
4048 ret = -EBUSY;
4049 goto done;
4050 }
4051
Alex Elder41f38c22012-10-25 23:34:40 -05004052 rbd_remove_all_snaps(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004053 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004054
4055done:
4056 mutex_unlock(&ctl_mutex);
Alex Elderaafb230e2012-09-06 16:00:54 -05004057
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004058 return ret;
4059}
4060
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004061/*
4062 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004063 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004064 */
4065static int rbd_sysfs_init(void)
4066{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004067 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004068
Alex Elderfed4c142012-02-07 12:03:36 -06004069 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06004070 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004071 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004072
Alex Elderfed4c142012-02-07 12:03:36 -06004073 ret = bus_register(&rbd_bus_type);
4074 if (ret < 0)
4075 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004076
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004077 return ret;
4078}
4079
static void rbd_sysfs_cleanup(void)
{
	/* Reverse order of rbd_sysfs_init(): bus first, then root device */
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
4085
4086int __init rbd_init(void)
4087{
4088 int rc;
4089
4090 rc = rbd_sysfs_init();
4091 if (rc)
4092 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06004093 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004094 return 0;
4095}
4096
/* Module unload: tear down the sysfs bus/device hierarchy. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
4101
4102module_init(rbd_init);
4103module_exit(rbd_exit);
4104
4105MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
4106MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
4107MODULE_DESCRIPTION("rados block device");
4108
4109/* following authorship retained from original osdblk.c */
4110MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
4111
4112MODULE_LICENSE("GPL");