blob: fd9656b5fdb97ba976e8881c6e8dfa6cee8a2b0e [file] [log] [blame]
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

                 Documentation/ABI/testing/sysfs-bus-rbd

 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define SECTOR_SHIFT	9
#define SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* It might be useful to have these defined elsewhere */

#define U8_MAX	((u8) (~0U))
#define U16_MAX	((u16) (~0U))
#define U32_MAX	((u32) (~0U))
#define U64_MAX	((u64) (~0ULL))

#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256	/* max minors per blkdev */

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN \
		(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */

#define RBD_FEATURE_LAYERING	1

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_ALL	(0)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
98/*
99 * block device image metadata (in-memory version)
100 */
101struct rbd_image_header {
Alex Elderf84344f2012-08-31 17:29:51 -0500102 /* These four fields never change for a given rbd image */
Alex Elder849b4262012-07-09 21:04:24 -0500103 char *object_prefix;
Alex Elder34b13182012-07-13 20:35:12 -0500104 u64 features;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700105 __u8 obj_order;
106 __u8 crypt_type;
107 __u8 comp_type;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700108
Alex Elderf84344f2012-08-31 17:29:51 -0500109 /* The remaining fields need to be updated occasionally */
110 u64 image_size;
111 struct ceph_snap_context *snapc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700112 char *snap_names;
113 u64 *snap_sizes;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700114
115 u64 obj_version;
116};
117
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500118/*
119 * An rbd image specification.
120 *
121 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
Alex Elderc66c6e02012-11-01 08:39:26 -0500122 * identify an image. Each rbd_dev structure includes a pointer to
123 * an rbd_spec structure that encapsulates this identity.
124 *
125 * Each of the id's in an rbd_spec has an associated name. For a
126 * user-mapped image, the names are supplied and the id's associated
127 * with them are looked up. For a layered image, a parent image is
128 * defined by the tuple, and the names are looked up.
129 *
130 * An rbd_dev structure contains a parent_spec pointer which is
131 * non-null if the image it represents is a child in a layered
132 * image. This pointer will refer to the rbd_spec structure used
133 * by the parent rbd_dev for its own identity (i.e., the structure
134 * is shared between the parent and child).
135 *
136 * Since these structures are populated once, during the discovery
137 * phase of image construction, they are effectively immutable so
138 * we make no effort to synchronize access to them.
139 *
140 * Note that code herein does not assume the image name is known (it
141 * could be a null pointer).
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500142 */
143struct rbd_spec {
144 u64 pool_id;
145 char *pool_name;
146
147 char *image_id;
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500148 char *image_name;
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500149
150 u64 snap_id;
151 char *snap_name;
152
153 struct kref kref;
154};
155
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700156/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600157 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700158 */
159struct rbd_client {
160 struct ceph_client *client;
161 struct kref kref;
162 struct list_head node;
163};
164
struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

/* How the data for an object request is supplied */
enum obj_request_type {
	OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};
Alex Elderbf0d5f502012-11-22 00:00:08 -0600176
177struct rbd_obj_request {
178 const char *object_name;
179 u64 offset; /* object start byte */
180 u64 length; /* bytes from offset */
181
182 struct rbd_img_request *img_request;
183 struct list_head links; /* img_request->obj_requests */
184 u32 which; /* posn image request list */
185
186 enum obj_request_type type;
Alex Elder788e2df2013-01-17 12:25:27 -0600187 union {
188 struct bio *bio_list;
189 struct {
190 struct page **pages;
191 u32 page_count;
192 };
193 };
Alex Elderbf0d5f502012-11-22 00:00:08 -0600194
195 struct ceph_osd_request *osd_req;
196
197 u64 xferred; /* bytes transferred */
198 u64 version;
199 s32 result;
200 atomic_t done;
201
202 rbd_obj_callback_t callback;
Alex Elder788e2df2013-01-17 12:25:27 -0600203 struct completion completion;
Alex Elderbf0d5f502012-11-22 00:00:08 -0600204
205 struct kref kref;
206};
207
208struct rbd_img_request {
209 struct request *rq;
210 struct rbd_device *rbd_dev;
211 u64 offset; /* starting image byte offset */
212 u64 length; /* byte count from offset */
213 bool write_request; /* false for read */
214 union {
215 struct ceph_snap_context *snapc; /* for writes */
216 u64 snap_id; /* for reads */
217 };
218 spinlock_t completion_lock;/* protects next_completion */
219 u32 next_completion;
220 rbd_img_callback_t callback;
221
222 u32 obj_request_count;
223 struct list_head obj_requests; /* rbd_obj_request structs */
224
225 struct kref kref;
226};
227
228#define for_each_obj_request(ireq, oreq) \
229 list_for_each_entry(oreq, &ireq->obj_requests, links)
230#define for_each_obj_request_from(ireq, oreq) \
231 list_for_each_entry_from(oreq, &ireq->obj_requests, links)
232#define for_each_obj_request_safe(ireq, oreq, n) \
233 list_for_each_entry_safe_reverse(oreq, n, &ireq->obj_requests, links)
234
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800235struct rbd_snap {
236 struct device dev;
237 const char *name;
Josh Durgin35915382011-12-05 18:25:13 -0800238 u64 size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800239 struct list_head node;
240 u64 id;
Alex Elder34b13182012-07-13 20:35:12 -0500241 u64 features;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800242};
243
Alex Elderf84344f2012-08-31 17:29:51 -0500244struct rbd_mapping {
Alex Elder99c1f082012-08-30 14:42:15 -0500245 u64 size;
Alex Elder34b13182012-07-13 20:35:12 -0500246 u64 features;
Alex Elderf84344f2012-08-31 17:29:51 -0500247 bool read_only;
248};
249
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700250/*
251 * a single device
252 */
253struct rbd_device {
Alex Elderde71a292012-07-03 16:01:19 -0500254 int dev_id; /* blkdev unique id */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700255
256 int major; /* blkdev assigned major */
257 struct gendisk *disk; /* blkdev's gendisk and rq */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700258
Alex Eldera30b71b2012-07-10 20:30:11 -0500259 u32 image_format; /* Either 1 or 2 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700260 struct rbd_client *rbd_client;
261
262 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
263
264 spinlock_t lock; /* queue lock */
265
266 struct rbd_image_header header;
Alex Elderd78b6502012-11-09 08:43:15 -0600267 atomic_t exists;
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500268 struct rbd_spec *spec;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700269
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500270 char *header_name;
Alex Elder971f8392012-10-25 23:34:41 -0500271
Alex Elder0903e872012-11-14 12:25:19 -0600272 struct ceph_file_layout layout;
273
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700274 struct ceph_osd_event *watch_event;
Alex Elder975241a2013-01-25 17:08:55 -0600275 struct rbd_obj_request *watch_request;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700276
Alex Elder86b00e02012-10-25 23:34:42 -0500277 struct rbd_spec *parent_spec;
278 u64 parent_overlap;
279
Josh Durginc6666012011-11-21 17:11:12 -0800280 /* protects updating the header */
281 struct rw_semaphore header_rwsem;
Alex Elderf84344f2012-08-31 17:29:51 -0500282
283 struct rbd_mapping mapping;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700284
285 struct list_head node;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800286
287 /* list of snapshots */
288 struct list_head snaps;
289
290 /* sysfs related */
291 struct device dev;
Alex Elder42382b72012-11-16 09:29:16 -0600292 unsigned long open_count;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800293};
294
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700295static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a822012-01-29 13:57:44 -0600296
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700297static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a822012-01-29 13:57:44 -0600298static DEFINE_SPINLOCK(rbd_dev_list_lock);
299
Alex Elder432b8582012-01-29 13:57:44 -0600300static LIST_HEAD(rbd_client_list); /* clients */
301static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700302
Alex Elder304f6802012-08-31 17:29:52 -0500303static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
304static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
305
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800306static void rbd_dev_release(struct device *dev);
Alex Elder41f38c22012-10-25 23:34:40 -0500307static void rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800308
Alex Elderf0f8cef2012-01-29 13:57:44 -0600309static ssize_t rbd_add(struct bus_type *bus, const char *buf,
310 size_t count);
311static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
312 size_t count);
313
314static struct bus_attribute rbd_bus_attrs[] = {
315 __ATTR(add, S_IWUSR, NULL, rbd_add),
316 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
317 __ATTR_NULL
318};
319
320static struct bus_type rbd_bus_type = {
321 .name = "rbd",
322 .bus_attrs = rbd_bus_attrs,
323};
324
325static void rbd_root_dev_release(struct device *dev)
326{
327}
328
329static struct device rbd_root_dev = {
330 .init_name = "rbd",
331 .release = rbd_root_dev_release,
332};
333
Alex Elder06ecc6c2012-11-01 10:17:15 -0500334static __printf(2, 3)
335void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
336{
337 struct va_format vaf;
338 va_list args;
339
340 va_start(args, fmt);
341 vaf.fmt = fmt;
342 vaf.va = &args;
343
344 if (!rbd_dev)
345 printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
346 else if (rbd_dev->disk)
347 printk(KERN_WARNING "%s: %s: %pV\n",
348 RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
349 else if (rbd_dev->spec && rbd_dev->spec->image_name)
350 printk(KERN_WARNING "%s: image %s: %pV\n",
351 RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
352 else if (rbd_dev->spec && rbd_dev->spec->image_id)
353 printk(KERN_WARNING "%s: id %s: %pV\n",
354 RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
355 else /* punt */
356 printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
357 RBD_DRV_NAME, rbd_dev, &vaf);
358 va_end(args);
359}
360
Alex Elderaafb230e2012-09-06 16:00:54 -0500361#ifdef RBD_DEBUG
362#define rbd_assert(expr) \
363 if (unlikely(!(expr))) { \
364 printk(KERN_ERR "\nAssertion failure in %s() " \
365 "at line %d:\n\n" \
366 "\trbd_assert(%s);\n\n", \
367 __func__, __LINE__, #expr); \
368 BUG(); \
369 }
370#else /* !RBD_DEBUG */
371# define rbd_assert(expr) ((void) 0)
372#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800373
Alex Elder117973f2012-08-31 17:29:55 -0500374static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
375static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700376
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700377static int rbd_open(struct block_device *bdev, fmode_t mode)
378{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600379 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700380
Alex Elderf84344f2012-08-31 17:29:51 -0500381 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700382 return -EROFS;
383
Alex Elder42382b72012-11-16 09:29:16 -0600384 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elderc3e946c2012-11-16 09:29:16 -0600385 (void) get_device(&rbd_dev->dev);
Alex Elderf84344f2012-08-31 17:29:51 -0500386 set_device_ro(bdev, rbd_dev->mapping.read_only);
Alex Elder42382b72012-11-16 09:29:16 -0600387 rbd_dev->open_count++;
388 mutex_unlock(&ctl_mutex);
Alex Elder340c7a22012-08-10 13:12:07 -0700389
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700390 return 0;
391}
392
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800393static int rbd_release(struct gendisk *disk, fmode_t mode)
394{
395 struct rbd_device *rbd_dev = disk->private_data;
396
Alex Elder42382b72012-11-16 09:29:16 -0600397 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
398 rbd_assert(rbd_dev->open_count > 0);
399 rbd_dev->open_count--;
Alex Elderc3e946c2012-11-16 09:29:16 -0600400 put_device(&rbd_dev->dev);
Alex Elder42382b72012-11-16 09:29:16 -0600401 mutex_unlock(&ctl_mutex);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800402
403 return 0;
404}
405
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700406static const struct block_device_operations rbd_bd_ops = {
407 .owner = THIS_MODULE,
408 .open = rbd_open,
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800409 .release = rbd_release,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700410};
411
412/*
413 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500414 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700415 */
Alex Elderf8c38922012-08-10 13:12:07 -0700416static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700417{
418 struct rbd_client *rbdc;
419 int ret = -ENOMEM;
420
421 dout("rbd_client_create\n");
422 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
423 if (!rbdc)
424 goto out_opt;
425
426 kref_init(&rbdc->kref);
427 INIT_LIST_HEAD(&rbdc->node);
428
Alex Elderbc534d82012-01-29 13:57:44 -0600429 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
430
Alex Elder43ae4702012-07-03 16:01:18 -0500431 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700432 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600433 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500434 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700435
436 ret = ceph_open_session(rbdc->client);
437 if (ret < 0)
438 goto out_err;
439
Alex Elder432b8582012-01-29 13:57:44 -0600440 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700441 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600442 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700443
Alex Elderbc534d82012-01-29 13:57:44 -0600444 mutex_unlock(&ctl_mutex);
445
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700446 dout("rbd_client_create created %p\n", rbdc);
447 return rbdc;
448
449out_err:
450 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600451out_mutex:
452 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700453 kfree(rbdc);
454out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500455 if (ceph_opts)
456 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400457 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700458}
459
460/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700461 * Find a ceph client with specific addr and configuration. If
462 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700463 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700464static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700465{
466 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700467 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700468
Alex Elder43ae4702012-07-03 16:01:18 -0500469 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700470 return NULL;
471
Alex Elder1f7ba332012-08-10 13:12:07 -0700472 spin_lock(&rbd_client_list_lock);
473 list_for_each_entry(client_node, &rbd_client_list, node) {
474 if (!ceph_compare_options(ceph_opts, client_node->client)) {
475 kref_get(&client_node->kref);
476 found = true;
477 break;
478 }
479 }
480 spin_unlock(&rbd_client_list_lock);
481
482 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700483}
484
485/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700486 * mount options
487 */
488enum {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700489 Opt_last_int,
490 /* int args above */
491 Opt_last_string,
492 /* string args above */
Alex Eldercc0538b2012-08-10 13:12:07 -0700493 Opt_read_only,
494 Opt_read_write,
495 /* Boolean args above */
496 Opt_last_bool,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700497};
498
Alex Elder43ae4702012-07-03 16:01:18 -0500499static match_table_t rbd_opts_tokens = {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700500 /* int args above */
501 /* string args above */
Alex Elderbe466c12012-10-22 11:31:26 -0500502 {Opt_read_only, "read_only"},
Alex Eldercc0538b2012-08-10 13:12:07 -0700503 {Opt_read_only, "ro"}, /* Alternate spelling */
504 {Opt_read_write, "read_write"},
505 {Opt_read_write, "rw"}, /* Alternate spelling */
506 /* Boolean args above */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700507 {-1, NULL}
508};
509
Alex Elder98571b52013-01-20 14:44:42 -0600510struct rbd_options {
511 bool read_only;
512};
513
514#define RBD_READ_ONLY_DEFAULT false
515
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700516static int parse_rbd_opts_token(char *c, void *private)
517{
Alex Elder43ae4702012-07-03 16:01:18 -0500518 struct rbd_options *rbd_opts = private;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700519 substring_t argstr[MAX_OPT_ARGS];
520 int token, intval, ret;
521
Alex Elder43ae4702012-07-03 16:01:18 -0500522 token = match_token(c, rbd_opts_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700523 if (token < 0)
524 return -EINVAL;
525
526 if (token < Opt_last_int) {
527 ret = match_int(&argstr[0], &intval);
528 if (ret < 0) {
529 pr_err("bad mount option arg (not int) "
530 "at '%s'\n", c);
531 return ret;
532 }
533 dout("got int token %d val %d\n", token, intval);
534 } else if (token > Opt_last_int && token < Opt_last_string) {
535 dout("got string token %d val %s\n", token,
536 argstr[0].from);
Alex Eldercc0538b2012-08-10 13:12:07 -0700537 } else if (token > Opt_last_string && token < Opt_last_bool) {
538 dout("got Boolean token %d\n", token);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700539 } else {
540 dout("got token %d\n", token);
541 }
542
543 switch (token) {
Alex Eldercc0538b2012-08-10 13:12:07 -0700544 case Opt_read_only:
545 rbd_opts->read_only = true;
546 break;
547 case Opt_read_write:
548 rbd_opts->read_only = false;
549 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700550 default:
Alex Elderaafb230e2012-09-06 16:00:54 -0500551 rbd_assert(false);
552 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700553 }
554 return 0;
555}
556
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Consumes ceph_opts either way: an existing
 * client means the options are no longer needed, and rbd_client_create()
 * takes ownership otherwise.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc)	/* using an existing client */
		ceph_destroy_options(ceph_opts);
	else
		rbdc = rbd_client_create(ceph_opts);

	return rbdc;
}
573
574/*
575 * Destroy ceph client
Alex Elderd23a4b32012-01-29 13:57:43 -0600576 *
Alex Elder432b8582012-01-29 13:57:44 -0600577 * Caller must hold rbd_client_list_lock.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700578 */
579static void rbd_client_release(struct kref *kref)
580{
581 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
582
583 dout("rbd_release_client %p\n", rbdc);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500584 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700585 list_del(&rbdc->node);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500586 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700587
588 ceph_destroy_client(rbdc->client);
589 kfree(rbdc);
590}
591
592/*
593 * Drop reference to ceph client node. If it's not referenced anymore, release
594 * it.
595 */
Alex Elder9d3997f2012-10-25 23:34:42 -0500596static void rbd_put_client(struct rbd_client *rbdc)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700597{
Alex Elderc53d5892012-10-25 23:34:42 -0500598 if (rbdc)
599 kref_put(&rbdc->kref, rbd_client_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700600}
601
Alex Eldera30b71b2012-07-10 20:30:11 -0500602static bool rbd_image_format_valid(u32 image_format)
603{
604 return image_format == 1 || image_format == 2;
605}
606
Alex Elder8e94af82012-07-25 09:32:40 -0500607static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
608{
Alex Elder103a1502012-08-02 11:29:45 -0500609 size_t size;
610 u32 snap_count;
611
612 /* The header has to start with the magic rbd header text */
613 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
614 return false;
615
Alex Elderdb2388b2012-10-20 22:17:27 -0500616 /* The bio layer requires at least sector-sized I/O */
617
618 if (ondisk->options.order < SECTOR_SHIFT)
619 return false;
620
621 /* If we use u64 in a few spots we may be able to loosen this */
622
623 if (ondisk->options.order > 8 * sizeof (int) - 1)
624 return false;
625
Alex Elder103a1502012-08-02 11:29:45 -0500626 /*
627 * The size of a snapshot header has to fit in a size_t, and
628 * that limits the number of snapshots.
629 */
630 snap_count = le32_to_cpu(ondisk->snap_count);
631 size = SIZE_MAX - sizeof (struct ceph_snap_context);
632 if (snap_count > size / sizeof (__le64))
633 return false;
634
635 /*
636 * Not only that, but the size of the entire the snapshot
637 * header must also be representable in a size_t.
638 */
639 size -= snap_count * sizeof (__le64);
640 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
641 return false;
642
643 return true;
Alex Elder8e94af82012-07-25 09:32:40 -0500644}
645
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700646/*
647 * Create a new header structure, translate header format from the on-disk
648 * header.
649 */
650static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500651 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700652{
Alex Elderccece232012-07-10 20:30:10 -0500653 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500654 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500655 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500656 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700657
Alex Elder6a523252012-07-19 17:12:59 -0500658 memset(header, 0, sizeof (*header));
659
Alex Elder103a1502012-08-02 11:29:45 -0500660 snap_count = le32_to_cpu(ondisk->snap_count);
661
Alex Elder58c17b02012-08-23 23:22:06 -0500662 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
663 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500664 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700665 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500666 memcpy(header->object_prefix, ondisk->object_prefix, len);
667 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600668
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700669 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500670 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
671
Alex Elder621901d2012-08-23 23:22:06 -0500672 /* Save a copy of the snapshot names */
673
Alex Elderf785cc12012-08-23 23:22:06 -0500674 if (snap_names_len > (u64) SIZE_MAX)
675 return -EIO;
676 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700677 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500678 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500679 /*
680 * Note that rbd_dev_v1_header_read() guarantees
681 * the ondisk buffer we're working with has
682 * snap_names_len bytes beyond the end of the
683 * snapshot id array, this memcpy() is safe.
684 */
685 memcpy(header->snap_names, &ondisk->snaps[snap_count],
686 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500687
Alex Elder621901d2012-08-23 23:22:06 -0500688 /* Record each snapshot's size */
689
Alex Elderd2bb24e2012-07-26 23:37:14 -0500690 size = snap_count * sizeof (*header->snap_sizes);
691 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700692 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500693 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500694 for (i = 0; i < snap_count; i++)
695 header->snap_sizes[i] =
696 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700697 } else {
Alex Elderccece232012-07-10 20:30:10 -0500698 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700699 header->snap_names = NULL;
700 header->snap_sizes = NULL;
701 }
Alex Elder849b4262012-07-09 21:04:24 -0500702
Alex Elder34b13182012-07-13 20:35:12 -0500703 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700704 header->obj_order = ondisk->options.order;
705 header->crypt_type = ondisk->options.crypt_type;
706 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500707
Alex Elder621901d2012-08-23 23:22:06 -0500708 /* Allocate and fill in the snapshot context */
709
Alex Elderf84344f2012-08-31 17:29:51 -0500710 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500711 size = sizeof (struct ceph_snap_context);
712 size += snap_count * sizeof (header->snapc->snaps[0]);
713 header->snapc = kzalloc(size, GFP_KERNEL);
714 if (!header->snapc)
715 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700716
717 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500718 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700719 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500720 for (i = 0; i < snap_count; i++)
721 header->snapc->snaps[i] =
722 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700723
724 return 0;
725
Alex Elder6a523252012-07-19 17:12:59 -0500726out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500727 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500728 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700729 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500730 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500731 kfree(header->object_prefix);
732 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500733
Alex Elder00f1f362012-02-07 12:03:36 -0600734 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700735}
736
Alex Elder9e15b772012-10-30 19:40:33 -0500737static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
738{
739 struct rbd_snap *snap;
740
741 if (snap_id == CEPH_NOSNAP)
742 return RBD_SNAP_HEAD_NAME;
743
744 list_for_each_entry(snap, &rbd_dev->snaps, node)
745 if (snap_id == snap->id)
746 return snap->name;
747
748 return NULL;
749}
750
Alex Elder8836b992012-08-30 14:42:15 -0500751static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700752{
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700753
Alex Eldere86924a2012-07-10 20:30:11 -0500754 struct rbd_snap *snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600755
Alex Eldere86924a2012-07-10 20:30:11 -0500756 list_for_each_entry(snap, &rbd_dev->snaps, node) {
757 if (!strcmp(snap_name, snap->name)) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500758 rbd_dev->spec->snap_id = snap->id;
Alex Eldere86924a2012-07-10 20:30:11 -0500759 rbd_dev->mapping.size = snap->size;
Alex Elder34b13182012-07-13 20:35:12 -0500760 rbd_dev->mapping.features = snap->features;
Alex Elder00f1f362012-02-07 12:03:36 -0600761
Alex Eldere86924a2012-07-10 20:30:11 -0500762 return 0;
Alex Elder00f1f362012-02-07 12:03:36 -0600763 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700764 }
Alex Eldere86924a2012-07-10 20:30:11 -0500765
Alex Elder00f1f362012-02-07 12:03:36 -0600766 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700767}
768
Alex Elder819d52b2012-10-25 23:34:41 -0500769static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700770{
Alex Elder78dc4472012-07-19 08:49:18 -0500771 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700772
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500773 if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
Josh Durgincc9d7342011-11-21 18:19:13 -0800774 sizeof (RBD_SNAP_HEAD_NAME))) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500775 rbd_dev->spec->snap_id = CEPH_NOSNAP;
Alex Elder99c1f082012-08-30 14:42:15 -0500776 rbd_dev->mapping.size = rbd_dev->header.image_size;
Alex Elder34b13182012-07-13 20:35:12 -0500777 rbd_dev->mapping.features = rbd_dev->header.features;
Alex Eldere86924a2012-07-10 20:30:11 -0500778 ret = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700779 } else {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500780 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700781 if (ret < 0)
782 goto done;
Alex Elderf84344f2012-08-31 17:29:51 -0500783 rbd_dev->mapping.read_only = true;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700784 }
Alex Elderd78b6502012-11-09 08:43:15 -0600785 atomic_set(&rbd_dev->exists, 1);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700786done:
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700787 return ret;
788}
789
790static void rbd_header_free(struct rbd_image_header *header)
791{
Alex Elder849b4262012-07-09 21:04:24 -0500792 kfree(header->object_prefix);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500793 header->object_prefix = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700794 kfree(header->snap_sizes);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500795 header->snap_sizes = NULL;
Alex Elder849b4262012-07-09 21:04:24 -0500796 kfree(header->snap_names);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500797 header->snap_names = NULL;
Josh Durgind1d25642011-12-05 14:03:05 -0800798 ceph_put_snap_context(header->snapc);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500799 header->snapc = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700800}
801
Alex Elder98571b52013-01-20 14:44:42 -0600802static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700803{
Alex Elder65ccfe22012-08-09 10:33:26 -0700804 char *name;
805 u64 segment;
806 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700807
Alex Elder2fd82b92012-11-09 15:05:54 -0600808 name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
Alex Elder65ccfe22012-08-09 10:33:26 -0700809 if (!name)
810 return NULL;
811 segment = offset >> rbd_dev->header.obj_order;
Alex Elder2fd82b92012-11-09 15:05:54 -0600812 ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
Alex Elder65ccfe22012-08-09 10:33:26 -0700813 rbd_dev->header.object_prefix, segment);
Alex Elder2fd82b92012-11-09 15:05:54 -0600814 if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
Alex Elder65ccfe22012-08-09 10:33:26 -0700815 pr_err("error formatting segment name for #%llu (%d)\n",
816 segment, ret);
817 kfree(name);
818 name = NULL;
819 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700820
Alex Elder65ccfe22012-08-09 10:33:26 -0700821 return name;
822}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700823
Alex Elder65ccfe22012-08-09 10:33:26 -0700824static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
825{
826 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700827
Alex Elder65ccfe22012-08-09 10:33:26 -0700828 return offset & (segment_size - 1);
829}
830
831static u64 rbd_segment_length(struct rbd_device *rbd_dev,
832 u64 offset, u64 length)
833{
834 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
835
836 offset &= segment_size - 1;
837
Alex Elderaafb230e2012-09-06 16:00:54 -0500838 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700839 if (offset + length > segment_size)
840 length = segment_size - offset;
841
842 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700843}
844
845/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700846 * returns the size of an object in the image
847 */
848static u64 rbd_obj_bytes(struct rbd_image_header *header)
849{
850 return 1 << header->obj_order;
851}
852
853/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700854 * bio helpers
855 */
856
857static void bio_chain_put(struct bio *chain)
858{
859 struct bio *tmp;
860
861 while (chain) {
862 tmp = chain;
863 chain = chain->bi_next;
864 bio_put(tmp);
865 }
866}
867
868/*
869 * zeros a bio chain, starting at specific offset
870 */
871static void zero_bio_chain(struct bio *chain, int start_ofs)
872{
873 struct bio_vec *bv;
874 unsigned long flags;
875 void *buf;
876 int i;
877 int pos = 0;
878
879 while (chain) {
880 bio_for_each_segment(bv, chain, i) {
881 if (pos + bv->bv_len > start_ofs) {
882 int remainder = max(start_ofs - pos, 0);
883 buf = bvec_kmap_irq(bv, &flags);
884 memset(buf + remainder, 0,
885 bv->bv_len - remainder);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +0200886 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700887 }
888 pos += bv->bv_len;
889 }
890
891 chain = chain->bi_next;
892 }
893}
894
895/*
Alex Elderf7760da2012-10-20 22:17:27 -0500896 * Clone a portion of a bio, starting at the given byte offset
897 * and continuing for the number of bytes indicated.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700898 */
Alex Elderf7760da2012-10-20 22:17:27 -0500899static struct bio *bio_clone_range(struct bio *bio_src,
900 unsigned int offset,
901 unsigned int len,
902 gfp_t gfpmask)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700903{
Alex Elderf7760da2012-10-20 22:17:27 -0500904 struct bio_vec *bv;
905 unsigned int resid;
906 unsigned short idx;
907 unsigned int voff;
908 unsigned short end_idx;
909 unsigned short vcnt;
910 struct bio *bio;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700911
Alex Elderf7760da2012-10-20 22:17:27 -0500912 /* Handle the easy case for the caller */
913
914 if (!offset && len == bio_src->bi_size)
915 return bio_clone(bio_src, gfpmask);
916
917 if (WARN_ON_ONCE(!len))
918 return NULL;
919 if (WARN_ON_ONCE(len > bio_src->bi_size))
920 return NULL;
921 if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
922 return NULL;
923
924 /* Find first affected segment... */
925
926 resid = offset;
927 __bio_for_each_segment(bv, bio_src, idx, 0) {
928 if (resid < bv->bv_len)
929 break;
930 resid -= bv->bv_len;
931 }
932 voff = resid;
933
934 /* ...and the last affected segment */
935
936 resid += len;
937 __bio_for_each_segment(bv, bio_src, end_idx, idx) {
938 if (resid <= bv->bv_len)
939 break;
940 resid -= bv->bv_len;
941 }
942 vcnt = end_idx - idx + 1;
943
944 /* Build the clone */
945
946 bio = bio_alloc(gfpmask, (unsigned int) vcnt);
947 if (!bio)
948 return NULL; /* ENOMEM */
949
950 bio->bi_bdev = bio_src->bi_bdev;
951 bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
952 bio->bi_rw = bio_src->bi_rw;
953 bio->bi_flags |= 1 << BIO_CLONED;
954
955 /*
956 * Copy over our part of the bio_vec, then update the first
957 * and last (or only) entries.
958 */
959 memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
960 vcnt * sizeof (struct bio_vec));
961 bio->bi_io_vec[0].bv_offset += voff;
962 if (vcnt > 1) {
963 bio->bi_io_vec[0].bv_len -= voff;
964 bio->bi_io_vec[vcnt - 1].bv_len = resid;
965 } else {
966 bio->bi_io_vec[0].bv_len = len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700967 }
968
Alex Elderf7760da2012-10-20 22:17:27 -0500969 bio->bi_vcnt = vcnt;
970 bio->bi_size = len;
971 bio->bi_idx = 0;
Alex Elder542582f2012-08-09 10:33:25 -0700972
Alex Elderf7760da2012-10-20 22:17:27 -0500973 return bio;
974}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700975
Alex Elderf7760da2012-10-20 22:17:27 -0500976/*
977 * Clone a portion of a bio chain, starting at the given byte offset
978 * into the first bio in the source chain and continuing for the
979 * number of bytes indicated. The result is another bio chain of
980 * exactly the given length, or a null pointer on error.
981 *
982 * The bio_src and offset parameters are both in-out. On entry they
983 * refer to the first source bio and the offset into that bio where
984 * the start of data to be cloned is located.
985 *
986 * On return, bio_src is updated to refer to the bio in the source
987 * chain that contains first un-cloned byte, and *offset will
988 * contain the offset of that byte within that bio.
989 */
990static struct bio *bio_chain_clone_range(struct bio **bio_src,
991 unsigned int *offset,
992 unsigned int len,
993 gfp_t gfpmask)
994{
995 struct bio *bi = *bio_src;
996 unsigned int off = *offset;
997 struct bio *chain = NULL;
998 struct bio **end;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700999
Alex Elderf7760da2012-10-20 22:17:27 -05001000 /* Build up a chain of clone bios up to the limit */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001001
Alex Elderf7760da2012-10-20 22:17:27 -05001002 if (!bi || off >= bi->bi_size || !len)
1003 return NULL; /* Nothing to clone */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001004
Alex Elderf7760da2012-10-20 22:17:27 -05001005 end = &chain;
1006 while (len) {
1007 unsigned int bi_size;
1008 struct bio *bio;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001009
Alex Elderf5400b72012-11-01 10:17:15 -05001010 if (!bi) {
1011 rbd_warn(NULL, "bio_chain exhausted with %u left", len);
Alex Elderf7760da2012-10-20 22:17:27 -05001012 goto out_err; /* EINVAL; ran out of bio's */
Alex Elderf5400b72012-11-01 10:17:15 -05001013 }
Alex Elderf7760da2012-10-20 22:17:27 -05001014 bi_size = min_t(unsigned int, bi->bi_size - off, len);
1015 bio = bio_clone_range(bi, off, bi_size, gfpmask);
1016 if (!bio)
1017 goto out_err; /* ENOMEM */
1018
1019 *end = bio;
1020 end = &bio->bi_next;
1021
1022 off += bi_size;
1023 if (off == bi->bi_size) {
1024 bi = bi->bi_next;
1025 off = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001026 }
Alex Elderf7760da2012-10-20 22:17:27 -05001027 len -= bi_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001028 }
Alex Elderf7760da2012-10-20 22:17:27 -05001029 *bio_src = bi;
1030 *offset = off;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001031
Alex Elderf7760da2012-10-20 22:17:27 -05001032 return chain;
1033out_err:
1034 bio_chain_put(chain);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001035
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001036 return NULL;
1037}
1038
Alex Elderbf0d5f502012-11-22 00:00:08 -06001039static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1040{
1041 kref_get(&obj_request->kref);
1042}
1043
static void rbd_obj_request_destroy(struct kref *kref);
/*
 * Drop a reference on an object request; the last put destroys it
 * via rbd_obj_request_destroy().
 */
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}
1050
/* Take an additional reference on an image request. */
static void rbd_img_request_get(struct rbd_img_request *img_request)
{
	kref_get(&img_request->kref);
}
1055
static void rbd_img_request_destroy(struct kref *kref);
/*
 * Drop a reference on an image request; the last put destroys it
 * via rbd_img_request_destroy().
 */
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	kref_put(&img_request->kref, rbd_img_request_destroy);
}
1062
/*
 * Attach an object request to an image request.  The object request
 * takes the next slot ("which") in the image request's ordered list
 * and a reference is taken on its behalf.  Note the ordering: the
 * slot is assigned from the count before the count is incremented.
 */
static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->img_request == NULL);

	rbd_obj_request_get(obj_request);
	obj_request->img_request = img_request;
	obj_request->which = img_request->obj_request_count;
	rbd_assert(obj_request->which != BAD_WHICH);
	img_request->obj_request_count++;
	list_add_tail(&obj_request->links, &img_request->obj_requests);
}
1075
/*
 * Detach an object request from its image request and drop the
 * reference taken when it was added.  The asserts require that the
 * request being removed is the most recently added one: its "which"
 * must equal the decremented request count.
 */
static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->which != BAD_WHICH);

	list_del(&obj_request->links);
	rbd_assert(img_request->obj_request_count > 0);
	img_request->obj_request_count--;
	rbd_assert(obj_request->which == img_request->obj_request_count);
	obj_request->which = BAD_WHICH;
	rbd_assert(obj_request->img_request == img_request);
	obj_request->img_request = NULL;
	obj_request->callback = NULL;
	rbd_obj_request_put(obj_request);
}
1091
1092static bool obj_request_type_valid(enum obj_request_type type)
1093{
1094 switch (type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001095 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001096 case OBJ_REQUEST_BIO:
Alex Elder788e2df2013-01-17 12:25:27 -06001097 case OBJ_REQUEST_PAGES:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001098 return true;
1099 default:
1100 return false;
1101 }
1102}
1103
Alex Elder8d23bf22012-11-19 22:55:21 -06001104struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
1105{
1106 struct ceph_osd_req_op *op;
1107 va_list args;
Alex Elder2647ba32012-11-19 22:55:21 -06001108 size_t size;
Alex Elder8d23bf22012-11-19 22:55:21 -06001109
1110 op = kzalloc(sizeof (*op), GFP_NOIO);
1111 if (!op)
1112 return NULL;
1113 op->op = opcode;
1114 va_start(args, opcode);
1115 switch (opcode) {
1116 case CEPH_OSD_OP_READ:
1117 case CEPH_OSD_OP_WRITE:
1118 /* rbd_osd_req_op_create(READ, offset, length) */
1119 /* rbd_osd_req_op_create(WRITE, offset, length) */
1120 op->extent.offset = va_arg(args, u64);
1121 op->extent.length = va_arg(args, u64);
1122 if (opcode == CEPH_OSD_OP_WRITE)
1123 op->payload_len = op->extent.length;
1124 break;
Alex Elder2647ba32012-11-19 22:55:21 -06001125 case CEPH_OSD_OP_CALL:
1126 /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
1127 op->cls.class_name = va_arg(args, char *);
1128 size = strlen(op->cls.class_name);
1129 rbd_assert(size <= (size_t) U8_MAX);
1130 op->cls.class_len = size;
1131 op->payload_len = size;
1132
1133 op->cls.method_name = va_arg(args, char *);
1134 size = strlen(op->cls.method_name);
1135 rbd_assert(size <= (size_t) U8_MAX);
1136 op->cls.method_len = size;
1137 op->payload_len += size;
1138
1139 op->cls.argc = 0;
1140 op->cls.indata = va_arg(args, void *);
1141 size = va_arg(args, size_t);
1142 rbd_assert(size <= (size_t) U32_MAX);
1143 op->cls.indata_len = (u32) size;
1144 op->payload_len += size;
1145 break;
Alex Elder5efea492012-11-19 22:55:21 -06001146 case CEPH_OSD_OP_NOTIFY_ACK:
1147 case CEPH_OSD_OP_WATCH:
1148 /* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
1149 /* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
1150 op->watch.cookie = va_arg(args, u64);
1151 op->watch.ver = va_arg(args, u64);
1152 op->watch.ver = cpu_to_le64(op->watch.ver);
1153 if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
1154 op->watch.flag = (u8) 1;
1155 break;
Alex Elder8d23bf22012-11-19 22:55:21 -06001156 default:
1157 rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
1158 kfree(op);
1159 op = NULL;
1160 break;
1161 }
1162 va_end(args);
1163
1164 return op;
1165}
1166
/* Free an op allocated by rbd_osd_req_op_create(). */
static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
{
	kfree(op);
}
1171
Alex Elderbf0d5f502012-11-22 00:00:08 -06001172static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1173 struct rbd_obj_request *obj_request)
1174{
1175 return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1176}
1177
1178static void rbd_img_request_complete(struct rbd_img_request *img_request)
1179{
1180 if (img_request->callback)
1181 img_request->callback(img_request);
1182 else
1183 rbd_img_request_put(img_request);
1184}
1185
Alex Elder788e2df2013-01-17 12:25:27 -06001186/* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1187
1188static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1189{
1190 return wait_for_completion_interruptible(&obj_request->completion);
1191}
1192
Alex Elder9969ebc2013-01-18 12:31:10 -06001193static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request,
1194 struct ceph_osd_op *op)
1195{
1196 atomic_set(&obj_request->done, 1);
1197}
1198
Alex Elderbf0d5f502012-11-22 00:00:08 -06001199static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1200{
1201 if (obj_request->callback)
1202 obj_request->callback(obj_request);
Alex Elder788e2df2013-01-17 12:25:27 -06001203 else
1204 complete_all(&obj_request->completion);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001205}
1206
Alex Elderbf0d5f502012-11-22 00:00:08 -06001207static void rbd_osd_read_callback(struct rbd_obj_request *obj_request,
1208 struct ceph_osd_op *op)
1209{
1210 u64 xferred;
1211
1212 /*
1213 * We support a 64-bit length, but ultimately it has to be
1214 * passed to blk_end_request(), which takes an unsigned int.
1215 */
1216 xferred = le64_to_cpu(op->extent.length);
1217 rbd_assert(xferred < (u64) UINT_MAX);
1218 if (obj_request->result == (s32) -ENOENT) {
1219 zero_bio_chain(obj_request->bio_list, 0);
1220 obj_request->result = 0;
1221 } else if (xferred < obj_request->length && !obj_request->result) {
1222 zero_bio_chain(obj_request->bio_list, xferred);
1223 xferred = obj_request->length;
1224 }
1225 obj_request->xferred = xferred;
1226 atomic_set(&obj_request->done, 1);
1227}
1228
/*
 * Completion handler for writes: record the byte count the osd
 * reports having transferred and mark the request done.
 */
static void rbd_osd_write_callback(struct rbd_obj_request *obj_request,
				struct ceph_osd_op *op)
{
	obj_request->xferred = le64_to_cpu(op->extent.length);
	atomic_set(&obj_request->done, 1);
}
1235
/*
 * Callback invoked by the osd client when a reply message arrives
 * for an osd request.  Extracts the result, transfer count, and
 * version from the reply, dispatches to the per-opcode handler, and
 * completes the object request if the handler marked it done.
 */
static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
				struct ceph_msg *msg)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;
	struct ceph_osd_reply_head *reply_head;
	struct ceph_osd_op *op;
	u32 num_ops;
	u16 opcode;

	rbd_assert(osd_req == obj_request->osd_req);
	/* Either part of an image request or standalone, never both */
	rbd_assert(!!obj_request->img_request ^
				(obj_request->which == BAD_WHICH));

	obj_request->xferred = le32_to_cpu(msg->hdr.data_len);
	reply_head = msg->front.iov_base;
	obj_request->result = (s32) le32_to_cpu(reply_head->result);
	obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);

	num_ops = le32_to_cpu(reply_head->num_ops);
	WARN_ON(num_ops != 1);	/* For now */

	op = &reply_head->ops[0];
	opcode = le16_to_cpu(op->op);
	switch (opcode) {
	case CEPH_OSD_OP_READ:
		rbd_osd_read_callback(obj_request, op);
		break;
	case CEPH_OSD_OP_WRITE:
		rbd_osd_write_callback(obj_request, op);
		break;
	case CEPH_OSD_OP_CALL:
	case CEPH_OSD_OP_NOTIFY_ACK:
	case CEPH_OSD_OP_WATCH:
		rbd_osd_trivial_callback(obj_request, op);
		break;
	default:
		rbd_warn(NULL, "%s: unsupported op %hu\n",
			obj_request->object_name, (unsigned short) opcode);
		break;
	}

	if (atomic_read(&obj_request->done))
		rbd_obj_request_complete(obj_request);
}
1280
/*
 * Create and initialize a ceph osd request for the given object
 * request, wrapping the single provided op.  For object requests
 * that belong to an image request, the snapshot context (writes) or
 * snapshot id (reads) is taken from the image request.  Returns NULL
 * on allocation failure.
 */
static struct ceph_osd_request *rbd_osd_req_create(
					struct rbd_device *rbd_dev,
					bool write_request,
					struct rbd_obj_request *obj_request,
					struct ceph_osd_req_op *op)
{
	struct rbd_img_request *img_request = obj_request->img_request;
	struct ceph_snap_context *snapc = NULL;
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;
	struct timespec now;
	struct timespec *mtime;
	u64 snap_id = CEPH_NOSNAP;
	u64 offset = obj_request->offset;
	u64 length = obj_request->length;

	if (img_request) {
		rbd_assert(img_request->write_request == write_request);
		if (img_request->write_request)
			snapc = img_request->snapc;
		else
			snap_id = img_request->snap_id;
	}

	/* Allocate and initialize the request, for the single op */

	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
	if (!osd_req)
		return NULL;	/* ENOMEM */

	/* Attach the request's data, according to its type */

	rbd_assert(obj_request_type_valid(obj_request->type));
	switch (obj_request->type) {
	case OBJ_REQUEST_NODATA:
		break;		/* Nothing to do */
	case OBJ_REQUEST_BIO:
		rbd_assert(obj_request->bio_list != NULL);
		osd_req->r_bio = obj_request->bio_list;
		bio_get(osd_req->r_bio);
		/* osd client requires "num pages" even for bio */
		osd_req->r_num_pages = calc_pages_for(offset, length);
		break;
	case OBJ_REQUEST_PAGES:
		osd_req->r_pages = obj_request->pages;
		osd_req->r_num_pages = obj_request->page_count;
		osd_req->r_page_alignment = offset & ~PAGE_MASK;
		break;
	}

	if (write_request) {
		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
		now = CURRENT_TIME;
		mtime = &now;
	} else {
		osd_req->r_flags = CEPH_OSD_FLAG_READ;
		mtime = NULL;	/* not needed for reads */
		offset = 0;	/* These are not used... */
		length = 0;	/* ...for osd read requests */
	}

	osd_req->r_callback = rbd_osd_req_callback;
	osd_req->r_priv = obj_request;

	osd_req->r_oid_len = strlen(obj_request->object_name);
	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);

	osd_req->r_file_layout = rbd_dev->layout;	/* struct */

	/* osd_req will get its own reference to snapc (if non-null) */

	ceph_osdc_build_request(osd_req, offset, length, 1, op,
				snapc, snap_id, mtime);

	return osd_req;
}
1357
/* Drop the reference on an osd request obtained at creation time. */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1362
1363/* object_name is assumed to be a non-null pointer and NUL-terminated */
1364
1365static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1366 u64 offset, u64 length,
1367 enum obj_request_type type)
1368{
1369 struct rbd_obj_request *obj_request;
1370 size_t size;
1371 char *name;
1372
1373 rbd_assert(obj_request_type_valid(type));
1374
1375 size = strlen(object_name) + 1;
1376 obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1377 if (!obj_request)
1378 return NULL;
1379
1380 name = (char *)(obj_request + 1);
1381 obj_request->object_name = memcpy(name, object_name, size);
1382 obj_request->offset = offset;
1383 obj_request->length = length;
1384 obj_request->which = BAD_WHICH;
1385 obj_request->type = type;
1386 INIT_LIST_HEAD(&obj_request->links);
1387 atomic_set(&obj_request->done, 0);
Alex Elder788e2df2013-01-17 12:25:27 -06001388 init_completion(&obj_request->completion);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001389 kref_init(&obj_request->kref);
1390
1391 return obj_request;
1392}
1393
/*
 * Final release of an object request (kref callback).  The request
 * must already be detached from any image request.  Releases the
 * associated osd request and any attached data (bio chain or page
 * vector) before freeing the request itself; the object name is
 * allocated inline with the request, so no separate free is needed.
 */
static void rbd_obj_request_destroy(struct kref *kref)
{
	struct rbd_obj_request *obj_request;

	obj_request = container_of(kref, struct rbd_obj_request, kref);

	rbd_assert(obj_request->img_request == NULL);
	rbd_assert(obj_request->which == BAD_WHICH);

	if (obj_request->osd_req)
		rbd_osd_req_destroy(obj_request->osd_req);

	rbd_assert(obj_request_type_valid(obj_request->type));
	switch (obj_request->type) {
	case OBJ_REQUEST_NODATA:
		break;		/* Nothing to do */
	case OBJ_REQUEST_BIO:
		if (obj_request->bio_list)
			bio_chain_put(obj_request->bio_list);
		break;
	case OBJ_REQUEST_PAGES:
		if (obj_request->pages)
			ceph_release_page_vector(obj_request->pages,
						obj_request->page_count);
		break;
	}

	kfree(obj_request);
}
1423
1424/*
1425 * Caller is responsible for filling in the list of object requests
1426 * that comprises the image request, and the Linux request pointer
1427 * (if there is one).
1428 */
1429struct rbd_img_request *rbd_img_request_create(struct rbd_device *rbd_dev,
1430 u64 offset, u64 length,
1431 bool write_request)
1432{
1433 struct rbd_img_request *img_request;
1434 struct ceph_snap_context *snapc = NULL;
1435
1436 img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
1437 if (!img_request)
1438 return NULL;
1439
1440 if (write_request) {
1441 down_read(&rbd_dev->header_rwsem);
1442 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1443 up_read(&rbd_dev->header_rwsem);
1444 if (WARN_ON(!snapc)) {
1445 kfree(img_request);
1446 return NULL; /* Shouldn't happen */
1447 }
1448 }
1449
1450 img_request->rq = NULL;
1451 img_request->rbd_dev = rbd_dev;
1452 img_request->offset = offset;
1453 img_request->length = length;
1454 img_request->write_request = write_request;
1455 if (write_request)
1456 img_request->snapc = snapc;
1457 else
1458 img_request->snap_id = rbd_dev->spec->snap_id;
1459 spin_lock_init(&img_request->completion_lock);
1460 img_request->next_completion = 0;
1461 img_request->callback = NULL;
1462 img_request->obj_request_count = 0;
1463 INIT_LIST_HEAD(&img_request->obj_requests);
1464 kref_init(&img_request->kref);
1465
1466 rbd_img_request_get(img_request); /* Avoid a warning */
1467 rbd_img_request_put(img_request); /* TEMPORARY */
1468
1469 return img_request;
1470}
1471
/*
 * Final release of an image request (kref callback).  Detaches and
 * drops every object request still attached, releases the write
 * snapshot context (if any), and frees the request.
 */
static void rbd_img_request_destroy(struct kref *kref)
{
	struct rbd_img_request *img_request;
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	img_request = container_of(kref, struct rbd_img_request, kref);

	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_img_obj_request_del(img_request, obj_request);
	rbd_assert(img_request->obj_request_count == 0);

	if (img_request->write_request)
		ceph_put_snap_context(img_request->snapc);

	kfree(img_request);
}
1489
1490static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
1491 struct bio *bio_list)
1492{
1493 struct rbd_device *rbd_dev = img_request->rbd_dev;
1494 struct rbd_obj_request *obj_request = NULL;
1495 struct rbd_obj_request *next_obj_request;
1496 unsigned int bio_offset;
1497 u64 image_offset;
1498 u64 resid;
1499 u16 opcode;
1500
1501 opcode = img_request->write_request ? CEPH_OSD_OP_WRITE
1502 : CEPH_OSD_OP_READ;
1503 bio_offset = 0;
1504 image_offset = img_request->offset;
1505 rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT);
1506 resid = img_request->length;
1507 while (resid) {
1508 const char *object_name;
1509 unsigned int clone_size;
1510 struct ceph_osd_req_op *op;
1511 u64 offset;
1512 u64 length;
1513
1514 object_name = rbd_segment_name(rbd_dev, image_offset);
1515 if (!object_name)
1516 goto out_unwind;
1517 offset = rbd_segment_offset(rbd_dev, image_offset);
1518 length = rbd_segment_length(rbd_dev, image_offset, resid);
1519 obj_request = rbd_obj_request_create(object_name,
1520 offset, length,
1521 OBJ_REQUEST_BIO);
1522 kfree(object_name); /* object request has its own copy */
1523 if (!obj_request)
1524 goto out_unwind;
1525
1526 rbd_assert(length <= (u64) UINT_MAX);
1527 clone_size = (unsigned int) length;
1528 obj_request->bio_list = bio_chain_clone_range(&bio_list,
1529 &bio_offset, clone_size,
1530 GFP_ATOMIC);
1531 if (!obj_request->bio_list)
1532 goto out_partial;
1533
1534 /*
1535 * Build up the op to use in building the osd
1536 * request. Note that the contents of the op are
1537 * copied by rbd_osd_req_create().
1538 */
1539 op = rbd_osd_req_op_create(opcode, offset, length);
1540 if (!op)
1541 goto out_partial;
1542 obj_request->osd_req = rbd_osd_req_create(rbd_dev,
1543 img_request->write_request,
1544 obj_request, op);
1545 rbd_osd_req_op_destroy(op);
1546 if (!obj_request->osd_req)
1547 goto out_partial;
1548 /* status and version are initially zero-filled */
1549
1550 rbd_img_obj_request_add(img_request, obj_request);
1551
1552 image_offset += length;
1553 resid -= length;
1554 }
1555
1556 return 0;
1557
1558out_partial:
1559 rbd_obj_request_put(obj_request);
1560out_unwind:
1561 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1562 rbd_obj_request_put(obj_request);
1563
1564 return -ENOMEM;
1565}
1566
/*
 * Per-object completion callback for object requests belonging to an
 * image request.  Completions may arrive out of order; under the
 * completion lock this advances next_completion over the longest
 * prefix of finished requests, ending the corresponding portion of
 * the block-layer request for each.  Once the final object request
 * has been accounted for, the image request itself is completed.
 */
static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	u32 which = obj_request->which;
	bool more = true;

	img_request = obj_request->img_request;
	rbd_assert(img_request != NULL);
	rbd_assert(img_request->rq != NULL);
	rbd_assert(which != BAD_WHICH);
	rbd_assert(which < img_request->obj_request_count);
	rbd_assert(which >= img_request->next_completion);

	spin_lock_irq(&img_request->completion_lock);
	if (which != img_request->next_completion)
		goto out;	/* an earlier request hasn't finished yet */

	for_each_obj_request_from(img_request, obj_request) {
		unsigned int xferred;
		int result;

		rbd_assert(more);
		rbd_assert(which < img_request->obj_request_count);

		if (!atomic_read(&obj_request->done))
			break;

		rbd_assert(obj_request->xferred <= (u64) UINT_MAX);
		xferred = (unsigned int) obj_request->xferred;
		result = (int) obj_request->result;
		if (result)
			rbd_warn(NULL, "obj_request %s result %d xferred %u\n",
				img_request->write_request ? "write" : "read",
				result, xferred);

		more = blk_end_request(img_request->rq, result, xferred);
		which++;
	}
	/* "more" is false exactly when the whole request is done */
	rbd_assert(more ^ (which == img_request->obj_request_count));
	img_request->next_completion = which;
out:
	spin_unlock_irq(&img_request->completion_lock);

	if (!more)
		rbd_img_request_complete(img_request);
}
1613
/*
 * Submit every object request that makes up an image request to the
 * osd client.  Stops and returns the error on the first submission
 * failure; requests already submitted remain in flight.
 *
 * NOTE(review): the loop keeps iterating after dropping the initial
 * per-object reference — presumably safe because the image request
 * still holds its own reference to each object request; confirm.
 */
static int rbd_img_request_submit(struct rbd_img_request *img_request)
{
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;

	for_each_obj_request(img_request, obj_request) {
		int ret;

		obj_request->callback = rbd_img_obj_callback;
		ret = rbd_obj_request_submit(osdc, obj_request);
		if (ret)
			return ret;
		/*
		 * The image request has its own reference to each
		 * of its object requests, so we can safely drop the
		 * initial one here.
		 */
		rbd_obj_request_put(obj_request);
	}

	return 0;
}
1637
Alex Eldercf81b602013-01-17 12:18:46 -06001638static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
Alex Elderb8d70032012-11-30 17:53:04 -06001639 u64 ver, u64 notify_id)
1640{
1641 struct rbd_obj_request *obj_request;
1642 struct ceph_osd_req_op *op;
1643 struct ceph_osd_client *osdc;
1644 int ret;
1645
1646 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
1647 OBJ_REQUEST_NODATA);
1648 if (!obj_request)
1649 return -ENOMEM;
1650
1651 ret = -ENOMEM;
1652 op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
1653 if (!op)
1654 goto out;
1655 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
1656 obj_request, op);
1657 rbd_osd_req_op_destroy(op);
1658 if (!obj_request->osd_req)
1659 goto out;
1660
1661 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Eldercf81b602013-01-17 12:18:46 -06001662 obj_request->callback = rbd_obj_request_put;
Alex Elderb8d70032012-11-30 17:53:04 -06001663 ret = rbd_obj_request_submit(osdc, obj_request);
Alex Elderb8d70032012-11-30 17:53:04 -06001664out:
Alex Eldercf81b602013-01-17 12:18:46 -06001665 if (ret)
1666 rbd_obj_request_put(obj_request);
Alex Elderb8d70032012-11-30 17:53:04 -06001667
1668 return ret;
1669}
1670
1671static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1672{
1673 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1674 u64 hver;
1675 int rc;
1676
1677 if (!rbd_dev)
1678 return;
1679
1680 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1681 rbd_dev->header_name, (unsigned long long) notify_id,
1682 (unsigned int) opcode);
1683 rc = rbd_dev_refresh(rbd_dev, &hver);
1684 if (rc)
1685 rbd_warn(rbd_dev, "got notification but failed to "
1686 " update snaps: %d\n", rc);
1687
Alex Eldercf81b602013-01-17 12:18:46 -06001688 rbd_obj_notify_ack(rbd_dev, hver, notify_id);
Alex Elderb8d70032012-11-30 17:53:04 -06001689}
1690
/*
 * Request sync osd watch/unwatch. The value of "start" determines
 * whether a watch request is being initiated or torn down.
 *
 * On start: creates the osd event, submits a lingering WATCH osd
 * request, and on success stashes it in rbd_dev->watch_request
 * (keeping its reference) so it survives until teardown.
 *
 * On teardown: unregisters the lingering request, submits the
 * unwatch request, drops the retained reference, and cancels the
 * event.  The out_cancel path doubles as both the error path and
 * the tail of the successful teardown path.
 *
 * Returns 0 on success, negative errno on failure.
 */
static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;
	struct ceph_osd_req_op *op;
	int ret;

	/* Starting implies no watch yet; stopping implies one exists */
	rbd_assert(start ^ !!rbd_dev->watch_event);
	rbd_assert(start ^ !!rbd_dev->watch_request);

	if (start) {
		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev,
						&rbd_dev->watch_event);
		if (ret < 0)
			return ret;
		rbd_assert(rbd_dev->watch_event != NULL);
	}

	ret = -ENOMEM;
	/* Watch/unwatch carries no object data */
	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
							OBJ_REQUEST_NODATA);
	if (!obj_request)
		goto out_cancel;

	op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
				rbd_dev->watch_event->cookie,
				rbd_dev->header.obj_version, start);
	if (!op)
		goto out_cancel;
	/* Watch requests are writes (true) from the osd's perspective */
	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true,
							obj_request, op);
	rbd_osd_req_op_destroy(op);
	if (!obj_request->osd_req)
		goto out_cancel;

	if (start)
		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
	else
		ceph_osdc_unregister_linger_request(osdc,
					rbd_dev->watch_request->osd_req);
	ret = rbd_obj_request_submit(osdc, obj_request);
	if (ret)
		goto out_cancel;
	/* Synchronous: wait for the osd to complete the (un)watch */
	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out_cancel;
	ret = obj_request->result;
	if (ret)
		goto out_cancel;

	/*
	 * A watch request is set to linger, so the underlying osd
	 * request won't go away until we unregister it.  We retain
	 * a pointer to the object request during that time (in
	 * rbd_dev->watch_request), so we'll keep a reference to
	 * it.  We'll drop that reference (below) after we've
	 * unregistered it.
	 */
	if (start) {
		rbd_dev->watch_request = obj_request;

		return 0;
	}

	/* We have successfully torn down the watch request */

	rbd_obj_request_put(rbd_dev->watch_request);
	rbd_dev->watch_request = NULL;
out_cancel:
	/* Cancel the event if we're tearing down, or on error */
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	if (obj_request)
		rbd_obj_request_put(obj_request);

	return ret;
}
1772
/*
 * Synchronous osd object method call.
 *
 * Issues a CEPH_OSD_OP_CALL for class_name.method_name on the named
 * object and waits for the result.  The reply payload is copied into
 * the caller-supplied "inbound" buffer (sized inbound_size); optional
 * "outbound" data supplies parameters for the method.  If "version"
 * is non-NULL it receives the object version from the reply.
 *
 * Returns the number of bytes copied into "inbound" on success
 * (ceph_copy_from_page_vector's return value), or a negative errno.
 *
 * Ownership note: the page vector is handed to the object request
 * once obj_request->pages is set; after that, dropping the object
 * request releases the pages, so the "else" branch at out: only
 * frees them when the request was never created.
 */
static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     u64 *version)
{
	struct rbd_obj_request *obj_request;
	struct ceph_osd_client *osdc;
	struct ceph_osd_req_op *op;
	struct page **pages;
	u32 page_count;
	int ret;

	/*
	 * Method calls are ultimately read operations but they
	 * don't involve object data (so no offset or length).
	 * The result should placed into the inbound buffer
	 * provided.  They also supply outbound data--parameters for
	 * the object method.  Currently if this is present it will
	 * be a snapshot id.
	 */
	page_count = (u32) calc_pages_for(0, inbound_size);
	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	ret = -ENOMEM;
	obj_request = rbd_obj_request_create(object_name, 0, 0,
							OBJ_REQUEST_PAGES);
	if (!obj_request)
		goto out;

	/* The object request now owns the page vector */
	obj_request->pages = pages;
	obj_request->page_count = page_count;

	op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
					method_name, outbound, outbound_size);
	if (!op)
		goto out;
	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
						obj_request, op);
	rbd_osd_req_op_destroy(op);
	if (!obj_request->osd_req)
		goto out;

	osdc = &rbd_dev->rbd_client->client->osdc;
	ret = rbd_obj_request_submit(osdc, obj_request);
	if (ret)
		goto out;
	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out;

	ret = obj_request->result;
	if (ret < 0)
		goto out;
	/* Copy however many bytes the osd actually returned */
	ret = ceph_copy_from_page_vector(pages, inbound, 0,
					obj_request->xferred);
	if (version)
		*version = obj_request->version;
out:
	if (obj_request)
		rbd_obj_request_put(obj_request);
	else
		ceph_release_page_vector(pages, page_count);

	return ret;
}
1848
/*
 * Block-layer request function.  Called with q->queue_lock held;
 * the lock is dropped while building and submitting the image
 * request (which may sleep/allocate) and reacquired before ending
 * failed requests and fetching the next one.
 *
 * Completion of successful requests happens asynchronously via
 * rbd_img_obj_callback(); only errors are completed inline here.
 */
static void rbd_request_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	bool read_only = rbd_dev->mapping.read_only;
	struct request *rq;
	int result;

	while ((rq = blk_fetch_request(q))) {
		bool write_request = rq_data_dir(rq) == WRITE;
		struct rbd_img_request *img_request;
		u64 offset;
		u64 length;

		/* Ignore any non-FS requests that filter through. */

		if (rq->cmd_type != REQ_TYPE_FS) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* Drop the queue lock while we do the real work */
		spin_unlock_irq(q->queue_lock);

		/* Disallow writes to a read-only device */

		if (write_request) {
			result = -EROFS;
			if (read_only)
				goto end_request;
			/* Writes are only valid against the base image */
			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
		}

		/* Quit early if the snapshot has disappeared */

		if (!atomic_read(&rbd_dev->exists)) {
			dout("request for non-existent snapshot");
			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
			result = -ENXIO;
			goto end_request;
		}

		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
		length = (u64) blk_rq_bytes(rq);

		result = -EINVAL;
		/* Guard against offset + length wrapping past U64_MAX */
		if (WARN_ON(offset && length > U64_MAX - offset + 1))
			goto end_request;	/* Shouldn't happen */

		result = -ENOMEM;
		img_request = rbd_img_request_create(rbd_dev, offset, length,
							write_request);
		if (!img_request)
			goto end_request;

		img_request->rq = rq;

		result = rbd_img_request_fill_bio(img_request, rq->bio);
		if (!result)
			result = rbd_img_request_submit(img_request);
		if (result)
			rbd_img_request_put(img_request);
end_request:
		/* Reacquire before ending the request / fetching the next */
		spin_lock_irq(q->queue_lock);
		if (result < 0) {
			rbd_warn(rbd_dev, "obj_request %s result %d\n",
				write_request ? "write" : "read", result);
			__blk_end_request_all(rq, result);
		}
	}
}
1918
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone_range()
 *
 * Returns the number of bytes of bvec that may be added to the bio
 * described by bmd (possibly 0, except that an empty bio must always
 * be allowed to take one page).
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	sector_t sector_offset;
	sector_t sectors_per_obj;
	sector_t obj_sector_offset;
	int ret;

	/*
	 * Find how far into its rbd object the partition-relative
	 * bio start sector is to offset relative to the enclosing
	 * device.
	 */
	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	/* Objects are power-of-two sized, so mask works as modulo */
	obj_sector_offset = sector_offset & (sectors_per_obj - 1);

	/*
	 * Compute the number of bytes from that offset to the end
	 * of the object.  Account for what's already used by the bio.
	 */
	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
	if (ret > bmd->bi_size)
		ret -= bmd->bi_size;
	else
		ret = 0;

	/*
	 * Don't send back more than was asked for.  And if the bio
	 * was empty, let the whole thing through because:  "Note
	 * that a block device *must* allow a single page to be
	 * added to an empty bio."
	 */
	rbd_assert(bvec->bv_len <= PAGE_SIZE);
	if (ret > (int) bvec->bv_len || !bmd->bi_size)
		ret = (int) bvec->bv_len;

	return ret;
}
1964
1965static void rbd_free_disk(struct rbd_device *rbd_dev)
1966{
1967 struct gendisk *disk = rbd_dev->disk;
1968
1969 if (!disk)
1970 return;
1971
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001972 if (disk->flags & GENHD_FL_UP)
1973 del_gendisk(disk);
1974 if (disk->queue)
1975 blk_cleanup_queue(disk->queue);
1976 put_disk(disk);
1977}
1978
Alex Elder788e2df2013-01-17 12:25:27 -06001979static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
1980 const char *object_name,
1981 u64 offset, u64 length,
1982 char *buf, u64 *version)
1983
1984{
1985 struct ceph_osd_req_op *op;
1986 struct rbd_obj_request *obj_request;
1987 struct ceph_osd_client *osdc;
1988 struct page **pages = NULL;
1989 u32 page_count;
1990 int ret;
1991
1992 page_count = (u32) calc_pages_for(offset, length);
1993 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
1994 if (IS_ERR(pages))
1995 ret = PTR_ERR(pages);
1996
1997 ret = -ENOMEM;
1998 obj_request = rbd_obj_request_create(object_name, offset, length,
Alex Elder36be9a72013-01-19 00:30:28 -06001999 OBJ_REQUEST_PAGES);
Alex Elder788e2df2013-01-17 12:25:27 -06002000 if (!obj_request)
2001 goto out;
2002
2003 obj_request->pages = pages;
2004 obj_request->page_count = page_count;
2005
2006 op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length);
2007 if (!op)
2008 goto out;
2009 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2010 obj_request, op);
2011 rbd_osd_req_op_destroy(op);
2012 if (!obj_request->osd_req)
2013 goto out;
2014
2015 osdc = &rbd_dev->rbd_client->client->osdc;
2016 ret = rbd_obj_request_submit(osdc, obj_request);
2017 if (ret)
2018 goto out;
2019 ret = rbd_obj_request_wait(obj_request);
2020 if (ret)
2021 goto out;
2022
2023 ret = obj_request->result;
2024 if (ret < 0)
2025 goto out;
2026 ret = ceph_copy_from_page_vector(pages, buf, 0, obj_request->xferred);
2027 if (version)
2028 *version = obj_request->version;
2029out:
2030 if (obj_request)
2031 rbd_obj_request_put(obj_request);
2032 else
2033 ceph_release_page_vector(pages, page_count);
2034
2035 return ret;
2036}
2037
/*
 * Read the complete header for the given rbd device.
 *
 * Returns a pointer to a dynamically-allocated buffer containing
 * the complete and validated header.  Caller can pass the address
 * of a variable that will be filled in with the version of the
 * header object at the time it was read.
 *
 * Returns a pointer-coded errno if a failure occurs.
 */
static struct rbd_image_header_ondisk *
rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		/* No-op on the first pass (ondisk is NULL) */
		kfree(ondisk);

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return ERR_PTR(-ENOMEM);

		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
				       0, size,
				       (char *) ondisk, version);

		if (ret < 0)
			goto out_err;
		/* A short read means the header is malformed/truncated */
		if (WARN_ON((size_t) ret < size)) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
				size, ret);
			goto out_err;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "invalid header");
			goto out_err;
		}

		/* Re-read if the snapshot count changed under us */
		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	return ondisk;

out_err:
	kfree(ondisk);

	return ERR_PTR(ret);
}
2106
2107/*
2108 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002109 */
2110static int rbd_read_header(struct rbd_device *rbd_dev,
2111 struct rbd_image_header *header)
2112{
Alex Elder4156d992012-08-02 11:29:46 -05002113 struct rbd_image_header_ondisk *ondisk;
2114 u64 ver = 0;
2115 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002116
Alex Elder4156d992012-08-02 11:29:46 -05002117 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
2118 if (IS_ERR(ondisk))
2119 return PTR_ERR(ondisk);
2120 ret = rbd_header_from_disk(header, ondisk);
2121 if (ret >= 0)
2122 header->obj_version = ver;
2123 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002124
Alex Elder4156d992012-08-02 11:29:46 -05002125 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002126}
2127
Alex Elder41f38c22012-10-25 23:34:40 -05002128static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002129{
2130 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05002131 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002132
Alex Eldera0593292012-07-19 09:09:27 -05002133 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder41f38c22012-10-25 23:34:40 -05002134 rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002135}
2136
Alex Elder94785542012-10-09 13:50:17 -07002137static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
2138{
2139 sector_t size;
2140
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002141 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Alex Elder94785542012-10-09 13:50:17 -07002142 return;
2143
2144 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
2145 dout("setting size to %llu sectors", (unsigned long long) size);
2146 rbd_dev->mapping.size = (u64) size;
2147 set_capacity(rbd_dev->disk, size);
2148}
2149
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002150/*
2151 * only read the first part of the ondisk header, without the snaps info
2152 */
Alex Elder117973f2012-08-31 17:29:55 -05002153static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002154{
2155 int ret;
2156 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002157
2158 ret = rbd_read_header(rbd_dev, &h);
2159 if (ret < 0)
2160 return ret;
2161
Josh Durgina51aa0c2011-12-05 10:35:04 -08002162 down_write(&rbd_dev->header_rwsem);
2163
Alex Elder94785542012-10-09 13:50:17 -07002164 /* Update image size, and check for resize of mapped image */
2165 rbd_dev->header.image_size = h.image_size;
2166 rbd_update_mapping_size(rbd_dev);
Sage Weil9db4b3e2011-04-19 22:49:06 -07002167
Alex Elder849b4262012-07-09 21:04:24 -05002168 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002169 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05002170 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08002171 /* osd requests may still refer to snapc */
2172 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002173
Alex Elderb8136232012-07-25 09:32:41 -05002174 if (hver)
2175 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08002176 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08002177 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002178 rbd_dev->header.snapc = h.snapc;
2179 rbd_dev->header.snap_names = h.snap_names;
2180 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05002181 /* Free the extra copy of the object prefix */
2182 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
2183 kfree(h.object_prefix);
2184
Alex Elder304f6802012-08-31 17:29:52 -05002185 ret = rbd_dev_snaps_update(rbd_dev);
2186 if (!ret)
2187 ret = rbd_dev_snaps_register(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002188
Josh Durginc6666012011-11-21 17:11:12 -08002189 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002190
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002191 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002192}
2193
Alex Elder117973f2012-08-31 17:29:55 -05002194static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
Alex Elder1fe5e992012-07-25 09:32:41 -05002195{
2196 int ret;
2197
Alex Elder117973f2012-08-31 17:29:55 -05002198 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder1fe5e992012-07-25 09:32:41 -05002199 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05002200 if (rbd_dev->image_format == 1)
2201 ret = rbd_dev_v1_refresh(rbd_dev, hver);
2202 else
2203 ret = rbd_dev_v2_refresh(rbd_dev, hver);
Alex Elder1fe5e992012-07-25 09:32:41 -05002204 mutex_unlock(&ctl_mutex);
2205
2206 return ret;
2207}
2208
/*
 * Allocate and configure the gendisk and request queue for an rbd
 * device.  I/O limits are aligned to the rbd object size so that a
 * single request never has to span more than one osd object (see
 * also rbd_merge_bvec()).
 *
 * Returns 0 on success, -ENOMEM on allocation failure.  The disk is
 * not yet added to the system (no add_disk() here).
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* rbd_dev->lock doubles as the queue lock for rbd_request_fn() */
	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
2256
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002257/*
2258 sysfs
2259*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002260
Alex Elder593a9e72012-02-07 12:03:37 -06002261static struct rbd_device *dev_to_rbd_dev(struct device *dev)
2262{
2263 return container_of(dev, struct rbd_device, dev);
2264}
2265
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002266static ssize_t rbd_size_show(struct device *dev,
2267 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002268{
Alex Elder593a9e72012-02-07 12:03:37 -06002269 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08002270 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002271
Josh Durgina51aa0c2011-12-05 10:35:04 -08002272 down_read(&rbd_dev->header_rwsem);
2273 size = get_capacity(rbd_dev->disk);
2274 up_read(&rbd_dev->header_rwsem);
2275
2276 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002277}
2278
Alex Elder34b13182012-07-13 20:35:12 -05002279/*
2280 * Note this shows the features for whatever's mapped, which is not
2281 * necessarily the base image.
2282 */
2283static ssize_t rbd_features_show(struct device *dev,
2284 struct device_attribute *attr, char *buf)
2285{
2286 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2287
2288 return sprintf(buf, "0x%016llx\n",
2289 (unsigned long long) rbd_dev->mapping.features);
2290}
2291
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002292static ssize_t rbd_major_show(struct device *dev,
2293 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002294{
Alex Elder593a9e72012-02-07 12:03:37 -06002295 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002296
2297 return sprintf(buf, "%d\n", rbd_dev->major);
2298}
2299
2300static ssize_t rbd_client_id_show(struct device *dev,
2301 struct device_attribute *attr, char *buf)
2302{
Alex Elder593a9e72012-02-07 12:03:37 -06002303 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002304
Alex Elder1dbb4392012-01-24 10:08:37 -06002305 return sprintf(buf, "client%lld\n",
2306 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002307}
2308
2309static ssize_t rbd_pool_show(struct device *dev,
2310 struct device_attribute *attr, char *buf)
2311{
Alex Elder593a9e72012-02-07 12:03:37 -06002312 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002313
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002314 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002315}
2316
Alex Elder9bb2f332012-07-12 10:46:35 -05002317static ssize_t rbd_pool_id_show(struct device *dev,
2318 struct device_attribute *attr, char *buf)
2319{
2320 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2321
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002322 return sprintf(buf, "%llu\n",
2323 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05002324}
2325
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002326static ssize_t rbd_name_show(struct device *dev,
2327 struct device_attribute *attr, char *buf)
2328{
Alex Elder593a9e72012-02-07 12:03:37 -06002329 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002330
Alex Eldera92ffdf2012-10-30 19:40:33 -05002331 if (rbd_dev->spec->image_name)
2332 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2333
2334 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002335}
2336
Alex Elder589d30e2012-07-10 20:30:11 -05002337static ssize_t rbd_image_id_show(struct device *dev,
2338 struct device_attribute *attr, char *buf)
2339{
2340 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2341
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002342 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05002343}
2344
Alex Elder34b13182012-07-13 20:35:12 -05002345/*
2346 * Shows the name of the currently-mapped snapshot (or
2347 * RBD_SNAP_HEAD_NAME for the base image).
2348 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002349static ssize_t rbd_snap_show(struct device *dev,
2350 struct device_attribute *attr,
2351 char *buf)
2352{
Alex Elder593a9e72012-02-07 12:03:37 -06002353 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002354
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002355 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002356}
2357
Alex Elder86b00e02012-10-25 23:34:42 -05002358/*
2359 * For an rbd v2 image, shows the pool id, image id, and snapshot id
2360 * for the parent image. If there is no parent, simply shows
2361 * "(no parent image)".
2362 */
2363static ssize_t rbd_parent_show(struct device *dev,
2364 struct device_attribute *attr,
2365 char *buf)
2366{
2367 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2368 struct rbd_spec *spec = rbd_dev->parent_spec;
2369 int count;
2370 char *bufp = buf;
2371
2372 if (!spec)
2373 return sprintf(buf, "(no parent image)\n");
2374
2375 count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
2376 (unsigned long long) spec->pool_id, spec->pool_name);
2377 if (count < 0)
2378 return count;
2379 bufp += count;
2380
2381 count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
2382 spec->image_name ? spec->image_name : "(unknown)");
2383 if (count < 0)
2384 return count;
2385 bufp += count;
2386
2387 count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
2388 (unsigned long long) spec->snap_id, spec->snap_name);
2389 if (count < 0)
2390 return count;
2391 bufp += count;
2392
2393 count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
2394 if (count < 0)
2395 return count;
2396 bufp += count;
2397
2398 return (ssize_t) (bufp - buf);
2399}
2400
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002401static ssize_t rbd_image_refresh(struct device *dev,
2402 struct device_attribute *attr,
2403 const char *buf,
2404 size_t size)
2405{
Alex Elder593a9e72012-02-07 12:03:37 -06002406 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05002407 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002408
Alex Elder117973f2012-08-31 17:29:55 -05002409 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05002410
2411 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002412}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002413
/*
 * Per-device sysfs attributes, exposed under
 * /sys/bus/rbd/devices/<id>/.  All are read-only except "refresh",
 * which is write-only and forces a header re-read.
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

static void rbd_sysfs_dev_release(struct device *dev)
{
	/* Intentionally empty: rbd_device lifetime is managed elsewhere */
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
2459
2460
2461/*
2462 sysfs - snapshots
2463*/
2464
2465static ssize_t rbd_snap_size_show(struct device *dev,
2466 struct device_attribute *attr,
2467 char *buf)
2468{
2469 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2470
Josh Durgin35915382011-12-05 18:25:13 -08002471 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002472}
2473
2474static ssize_t rbd_snap_id_show(struct device *dev,
2475 struct device_attribute *attr,
2476 char *buf)
2477{
2478 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2479
Josh Durgin35915382011-12-05 18:25:13 -08002480 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002481}
2482
Alex Elder34b13182012-07-13 20:35:12 -05002483static ssize_t rbd_snap_features_show(struct device *dev,
2484 struct device_attribute *attr,
2485 char *buf)
2486{
2487 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2488
2489 return sprintf(buf, "0x%016llx\n",
2490 (unsigned long long) snap->features);
2491}
2492
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002493static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2494static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
Alex Elder34b13182012-07-13 20:35:12 -05002495static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002496
/* Attributes present in each snapshot's sysfs directory */
static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};
2503
/* Attribute group wrapping the snapshot attributes above */
static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};
2507
2508static void rbd_snap_dev_release(struct device *dev)
2509{
2510 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2511 kfree(snap->name);
2512 kfree(snap);
2513}
2514
/* NULL-terminated group list attached to the snapshot device type */
static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};
2519
/* sysfs device type for rbd snapshots */
static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2524
Alex Elder8b8fb992012-10-26 17:25:24 -05002525static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2526{
2527 kref_get(&spec->kref);
2528
2529 return spec;
2530}
2531
2532static void rbd_spec_free(struct kref *kref);
2533static void rbd_spec_put(struct rbd_spec *spec)
2534{
2535 if (spec)
2536 kref_put(&spec->kref, rbd_spec_free);
2537}
2538
/*
 * Allocate a zero-filled rbd_spec with its reference count
 * initialized to 1.  Returns NULL on allocation failure.
 */
static struct rbd_spec *rbd_spec_alloc(void)
{
	struct rbd_spec *spec;

	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
	if (!spec)
		return NULL;
	kref_init(&spec->kref);

	/*
	 * No-op get/put pair (kref goes 1 -> 2 -> 1).  NOTE(review):
	 * presumably this exists only to keep rbd_spec_get()/_put()
	 * referenced until their real callers land -- confirm that
	 * before removing it, or a -Wunused-function warning may appear.
	 */
	rbd_spec_put(rbd_spec_get(spec));	/* TEMPORARY */

	return spec;
}
2552
2553static void rbd_spec_free(struct kref *kref)
2554{
2555 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2556
2557 kfree(spec->pool_name);
2558 kfree(spec->image_id);
2559 kfree(spec->image_name);
2560 kfree(spec->snap_name);
2561 kfree(spec);
2562}
2563
Alex Elderc53d5892012-10-25 23:34:42 -05002564struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2565 struct rbd_spec *spec)
2566{
2567 struct rbd_device *rbd_dev;
2568
2569 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2570 if (!rbd_dev)
2571 return NULL;
2572
2573 spin_lock_init(&rbd_dev->lock);
Alex Elderd78b6502012-11-09 08:43:15 -06002574 atomic_set(&rbd_dev->exists, 0);
Alex Elderc53d5892012-10-25 23:34:42 -05002575 INIT_LIST_HEAD(&rbd_dev->node);
2576 INIT_LIST_HEAD(&rbd_dev->snaps);
2577 init_rwsem(&rbd_dev->header_rwsem);
2578
2579 rbd_dev->spec = spec;
2580 rbd_dev->rbd_client = rbdc;
2581
Alex Elder0903e872012-11-14 12:25:19 -06002582 /* Initialize the layout used for all rbd requests */
2583
2584 rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2585 rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
2586 rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2587 rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
2588
Alex Elderc53d5892012-10-25 23:34:42 -05002589 return rbd_dev;
2590}
2591
/*
 * Free an rbd_device: drop its references to the parent spec, the
 * ceph client, and its own spec, free the header object name, then
 * the device itself.  rbd_spec_put() and kfree() both tolerate NULL.
 */
static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	rbd_spec_put(rbd_dev->parent_spec);
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev);
}
2600
Alex Elder304f6802012-08-31 17:29:52 -05002601static bool rbd_snap_registered(struct rbd_snap *snap)
2602{
2603 bool ret = snap->dev.type == &rbd_snap_device_type;
2604 bool reg = device_is_registered(&snap->dev);
2605
2606 rbd_assert(!ret ^ reg);
2607
2608 return ret;
2609}
2610
Alex Elder41f38c22012-10-25 23:34:40 -05002611static void rbd_remove_snap_dev(struct rbd_snap *snap)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002612{
2613 list_del(&snap->node);
Alex Elder304f6802012-08-31 17:29:52 -05002614 if (device_is_registered(&snap->dev))
2615 device_unregister(&snap->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002616}
2617
Alex Elder14e70852012-07-19 09:09:27 -05002618static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002619 struct device *parent)
2620{
2621 struct device *dev = &snap->dev;
2622 int ret;
2623
2624 dev->type = &rbd_snap_device_type;
2625 dev->parent = parent;
2626 dev->release = rbd_snap_dev_release;
Alex Elderd4b125e2012-07-03 16:01:19 -05002627 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
Alex Elder304f6802012-08-31 17:29:52 -05002628 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2629
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002630 ret = device_register(dev);
2631
2632 return ret;
2633}
2634
Alex Elder4e891e02012-07-10 20:30:10 -05002635static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05002636 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05002637 u64 snap_id, u64 snap_size,
2638 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002639{
Alex Elder4e891e02012-07-10 20:30:10 -05002640 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002641 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002642
2643 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002644 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002645 return ERR_PTR(-ENOMEM);
2646
2647 ret = -ENOMEM;
Alex Elderc8d18422012-07-10 20:30:11 -05002648 snap->name = kstrdup(snap_name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002649 if (!snap->name)
2650 goto err;
2651
Alex Elderc8d18422012-07-10 20:30:11 -05002652 snap->id = snap_id;
2653 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05002654 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05002655
2656 return snap;
2657
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002658err:
2659 kfree(snap->name);
2660 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002661
2662 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002663}
2664
Alex Eldercd892122012-07-03 16:01:19 -05002665static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2666 u64 *snap_size, u64 *snap_features)
2667{
2668 char *snap_name;
2669
2670 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2671
2672 *snap_size = rbd_dev->header.snap_sizes[which];
2673 *snap_features = 0; /* No features for v1 */
2674
2675 /* Skip over names until we find the one we are looking for */
2676
2677 snap_name = rbd_dev->header.snap_names;
2678 while (which--)
2679 snap_name += strlen(snap_name) + 1;
2680
2681 return snap_name;
2682}
2683
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002684/*
Alex Elder9d475de2012-07-03 16:01:19 -05002685 * Get the size and object order for an image snapshot, or if
2686 * snap_id is CEPH_NOSNAP, gets this information for the base
2687 * image.
2688 */
2689static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
2690 u8 *order, u64 *snap_size)
2691{
2692 __le64 snapid = cpu_to_le64(snap_id);
2693 int ret;
2694 struct {
2695 u8 order;
2696 __le64 size;
2697 } __attribute__ ((packed)) size_buf = { 0 };
2698
Alex Elder36be9a72013-01-19 00:30:28 -06002699 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder9d475de2012-07-03 16:01:19 -05002700 "rbd", "get_size",
2701 (char *) &snapid, sizeof (snapid),
Alex Elder07b23912012-11-09 08:43:16 -06002702 (char *) &size_buf, sizeof (size_buf), NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06002703 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder9d475de2012-07-03 16:01:19 -05002704 if (ret < 0)
2705 return ret;
2706
2707 *order = size_buf.order;
2708 *snap_size = le64_to_cpu(size_buf.size);
2709
2710 dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
2711 (unsigned long long) snap_id, (unsigned int) *order,
2712 (unsigned long long) *snap_size);
2713
2714 return 0;
2715}
2716
2717static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2718{
2719 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2720 &rbd_dev->header.obj_order,
2721 &rbd_dev->header.image_size);
2722}
2723
Alex Elder1e130192012-07-03 16:01:19 -05002724static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2725{
2726 void *reply_buf;
2727 int ret;
2728 void *p;
2729
2730 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
2731 if (!reply_buf)
2732 return -ENOMEM;
2733
Alex Elder36be9a72013-01-19 00:30:28 -06002734 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder1e130192012-07-03 16:01:19 -05002735 "rbd", "get_object_prefix",
2736 NULL, 0,
Alex Elder07b23912012-11-09 08:43:16 -06002737 reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06002738 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder1e130192012-07-03 16:01:19 -05002739 if (ret < 0)
2740 goto out;
Alex Elder36be9a72013-01-19 00:30:28 -06002741 ret = 0; /* rbd_obj_method_sync() can return positive */
Alex Elder1e130192012-07-03 16:01:19 -05002742
2743 p = reply_buf;
2744 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
2745 p + RBD_OBJ_PREFIX_LEN_MAX,
2746 NULL, GFP_NOIO);
2747
2748 if (IS_ERR(rbd_dev->header.object_prefix)) {
2749 ret = PTR_ERR(rbd_dev->header.object_prefix);
2750 rbd_dev->header.object_prefix = NULL;
2751 } else {
2752 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
2753 }
2754
2755out:
2756 kfree(reply_buf);
2757
2758 return ret;
2759}
2760
Alex Elderb1b54022012-07-03 16:01:19 -05002761static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2762 u64 *snap_features)
2763{
2764 __le64 snapid = cpu_to_le64(snap_id);
2765 struct {
2766 __le64 features;
2767 __le64 incompat;
2768 } features_buf = { 0 };
Alex Elderd8891402012-10-09 13:50:17 -07002769 u64 incompat;
Alex Elderb1b54022012-07-03 16:01:19 -05002770 int ret;
2771
Alex Elder36be9a72013-01-19 00:30:28 -06002772 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elderb1b54022012-07-03 16:01:19 -05002773 "rbd", "get_features",
2774 (char *) &snapid, sizeof (snapid),
2775 (char *) &features_buf, sizeof (features_buf),
Alex Elder07b23912012-11-09 08:43:16 -06002776 NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06002777 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderb1b54022012-07-03 16:01:19 -05002778 if (ret < 0)
2779 return ret;
Alex Elderd8891402012-10-09 13:50:17 -07002780
2781 incompat = le64_to_cpu(features_buf.incompat);
2782 if (incompat & ~RBD_FEATURES_ALL)
Alex Elderb8f5c6e2012-11-01 08:39:26 -05002783 return -ENXIO;
Alex Elderd8891402012-10-09 13:50:17 -07002784
Alex Elderb1b54022012-07-03 16:01:19 -05002785 *snap_features = le64_to_cpu(features_buf.features);
2786
2787 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2788 (unsigned long long) snap_id,
2789 (unsigned long long) *snap_features,
2790 (unsigned long long) le64_to_cpu(features_buf.incompat));
2791
2792 return 0;
2793}
2794
/* Fetch the feature bits for the base image (snapshot CEPH_NOSNAP) */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}
2800
Alex Elder86b00e02012-10-25 23:34:42 -05002801static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
2802{
2803 struct rbd_spec *parent_spec;
2804 size_t size;
2805 void *reply_buf = NULL;
2806 __le64 snapid;
2807 void *p;
2808 void *end;
2809 char *image_id;
2810 u64 overlap;
Alex Elder86b00e02012-10-25 23:34:42 -05002811 int ret;
2812
2813 parent_spec = rbd_spec_alloc();
2814 if (!parent_spec)
2815 return -ENOMEM;
2816
2817 size = sizeof (__le64) + /* pool_id */
2818 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
2819 sizeof (__le64) + /* snap_id */
2820 sizeof (__le64); /* overlap */
2821 reply_buf = kmalloc(size, GFP_KERNEL);
2822 if (!reply_buf) {
2823 ret = -ENOMEM;
2824 goto out_err;
2825 }
2826
2827 snapid = cpu_to_le64(CEPH_NOSNAP);
Alex Elder36be9a72013-01-19 00:30:28 -06002828 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder86b00e02012-10-25 23:34:42 -05002829 "rbd", "get_parent",
2830 (char *) &snapid, sizeof (snapid),
Alex Elder07b23912012-11-09 08:43:16 -06002831 (char *) reply_buf, size, NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06002832 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder86b00e02012-10-25 23:34:42 -05002833 if (ret < 0)
2834 goto out_err;
2835
2836 ret = -ERANGE;
2837 p = reply_buf;
2838 end = (char *) reply_buf + size;
2839 ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
2840 if (parent_spec->pool_id == CEPH_NOPOOL)
2841 goto out; /* No parent? No problem. */
2842
Alex Elder0903e872012-11-14 12:25:19 -06002843 /* The ceph file layout needs to fit pool id in 32 bits */
2844
2845 ret = -EIO;
2846 if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
2847 goto out;
2848
Alex Elder979ed482012-11-01 08:39:26 -05002849 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elder86b00e02012-10-25 23:34:42 -05002850 if (IS_ERR(image_id)) {
2851 ret = PTR_ERR(image_id);
2852 goto out_err;
2853 }
2854 parent_spec->image_id = image_id;
2855 ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
2856 ceph_decode_64_safe(&p, end, overlap, out_err);
2857
2858 rbd_dev->parent_overlap = overlap;
2859 rbd_dev->parent_spec = parent_spec;
2860 parent_spec = NULL; /* rbd_dev now owns this */
2861out:
2862 ret = 0;
2863out_err:
2864 kfree(reply_buf);
2865 rbd_spec_put(parent_spec);
2866
2867 return ret;
2868}
2869
Alex Elder9e15b772012-10-30 19:40:33 -05002870static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
2871{
2872 size_t image_id_size;
2873 char *image_id;
2874 void *p;
2875 void *end;
2876 size_t size;
2877 void *reply_buf = NULL;
2878 size_t len = 0;
2879 char *image_name = NULL;
2880 int ret;
2881
2882 rbd_assert(!rbd_dev->spec->image_name);
2883
Alex Elder69e7a022012-11-01 08:39:26 -05002884 len = strlen(rbd_dev->spec->image_id);
2885 image_id_size = sizeof (__le32) + len;
Alex Elder9e15b772012-10-30 19:40:33 -05002886 image_id = kmalloc(image_id_size, GFP_KERNEL);
2887 if (!image_id)
2888 return NULL;
2889
2890 p = image_id;
2891 end = (char *) image_id + image_id_size;
Alex Elder69e7a022012-11-01 08:39:26 -05002892 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
Alex Elder9e15b772012-10-30 19:40:33 -05002893
2894 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
2895 reply_buf = kmalloc(size, GFP_KERNEL);
2896 if (!reply_buf)
2897 goto out;
2898
Alex Elder36be9a72013-01-19 00:30:28 -06002899 ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
Alex Elder9e15b772012-10-30 19:40:33 -05002900 "rbd", "dir_get_name",
2901 image_id, image_id_size,
Alex Elder07b23912012-11-09 08:43:16 -06002902 (char *) reply_buf, size, NULL);
Alex Elder9e15b772012-10-30 19:40:33 -05002903 if (ret < 0)
2904 goto out;
2905 p = reply_buf;
2906 end = (char *) reply_buf + size;
2907 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
2908 if (IS_ERR(image_name))
2909 image_name = NULL;
2910 else
2911 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
2912out:
2913 kfree(reply_buf);
2914 kfree(image_id);
2915
2916 return image_name;
2917}
2918
2919/*
2920 * When a parent image gets probed, we only have the pool, image,
2921 * and snapshot ids but not the names of any of them. This call
2922 * is made later to fill in those names. It has to be done after
2923 * rbd_dev_snaps_update() has completed because some of the
2924 * information (in particular, snapshot name) is not available
2925 * until then.
2926 */
2927static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
2928{
2929 struct ceph_osd_client *osdc;
2930 const char *name;
2931 void *reply_buf = NULL;
2932 int ret;
2933
2934 if (rbd_dev->spec->pool_name)
2935 return 0; /* Already have the names */
2936
2937 /* Look up the pool name */
2938
2939 osdc = &rbd_dev->rbd_client->client->osdc;
2940 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05002941 if (!name) {
2942 rbd_warn(rbd_dev, "there is no pool with id %llu",
2943 rbd_dev->spec->pool_id); /* Really a BUG() */
2944 return -EIO;
2945 }
Alex Elder9e15b772012-10-30 19:40:33 -05002946
2947 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
2948 if (!rbd_dev->spec->pool_name)
2949 return -ENOMEM;
2950
2951 /* Fetch the image name; tolerate failure here */
2952
2953 name = rbd_dev_image_name(rbd_dev);
Alex Elder69e7a022012-11-01 08:39:26 -05002954 if (name)
Alex Elder9e15b772012-10-30 19:40:33 -05002955 rbd_dev->spec->image_name = (char *) name;
Alex Elder69e7a022012-11-01 08:39:26 -05002956 else
Alex Elder06ecc6c2012-11-01 10:17:15 -05002957 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05002958
2959 /* Look up the snapshot name. */
2960
2961 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
2962 if (!name) {
Alex Elder935dc892012-11-01 10:17:15 -05002963 rbd_warn(rbd_dev, "no snapshot with id %llu",
2964 rbd_dev->spec->snap_id); /* Really a BUG() */
Alex Elder9e15b772012-10-30 19:40:33 -05002965 ret = -EIO;
2966 goto out_err;
2967 }
2968 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
2969 if(!rbd_dev->spec->snap_name)
2970 goto out_err;
2971
2972 return 0;
2973out_err:
2974 kfree(reply_buf);
2975 kfree(rbd_dev->spec->pool_name);
2976 rbd_dev->spec->pool_name = NULL;
2977
2978 return ret;
2979}
2980
Alex Elder6e14b1a2012-07-03 16:01:19 -05002981static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05002982{
2983 size_t size;
2984 int ret;
2985 void *reply_buf;
2986 void *p;
2987 void *end;
2988 u64 seq;
2989 u32 snap_count;
2990 struct ceph_snap_context *snapc;
2991 u32 i;
2992
2993 /*
2994 * We'll need room for the seq value (maximum snapshot id),
2995 * snapshot count, and array of that many snapshot ids.
2996 * For now we have a fixed upper limit on the number we're
2997 * prepared to receive.
2998 */
2999 size = sizeof (__le64) + sizeof (__le32) +
3000 RBD_MAX_SNAP_COUNT * sizeof (__le64);
3001 reply_buf = kzalloc(size, GFP_KERNEL);
3002 if (!reply_buf)
3003 return -ENOMEM;
3004
Alex Elder36be9a72013-01-19 00:30:28 -06003005 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder35d489f2012-07-03 16:01:19 -05003006 "rbd", "get_snapcontext",
3007 NULL, 0,
Alex Elder07b23912012-11-09 08:43:16 -06003008 reply_buf, size, ver);
Alex Elder36be9a72013-01-19 00:30:28 -06003009 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder35d489f2012-07-03 16:01:19 -05003010 if (ret < 0)
3011 goto out;
3012
3013 ret = -ERANGE;
3014 p = reply_buf;
3015 end = (char *) reply_buf + size;
3016 ceph_decode_64_safe(&p, end, seq, out);
3017 ceph_decode_32_safe(&p, end, snap_count, out);
3018
3019 /*
3020 * Make sure the reported number of snapshot ids wouldn't go
3021 * beyond the end of our buffer. But before checking that,
3022 * make sure the computed size of the snapshot context we
3023 * allocate is representable in a size_t.
3024 */
3025 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
3026 / sizeof (u64)) {
3027 ret = -EINVAL;
3028 goto out;
3029 }
3030 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3031 goto out;
3032
3033 size = sizeof (struct ceph_snap_context) +
3034 snap_count * sizeof (snapc->snaps[0]);
3035 snapc = kmalloc(size, GFP_KERNEL);
3036 if (!snapc) {
3037 ret = -ENOMEM;
3038 goto out;
3039 }
3040
3041 atomic_set(&snapc->nref, 1);
3042 snapc->seq = seq;
3043 snapc->num_snaps = snap_count;
3044 for (i = 0; i < snap_count; i++)
3045 snapc->snaps[i] = ceph_decode_64(&p);
3046
3047 rbd_dev->header.snapc = snapc;
3048
3049 dout(" snap context seq = %llu, snap_count = %u\n",
3050 (unsigned long long) seq, (unsigned int) snap_count);
3051
3052out:
3053 kfree(reply_buf);
3054
3055 return 0;
3056}
3057
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003058static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
3059{
3060 size_t size;
3061 void *reply_buf;
3062 __le64 snap_id;
3063 int ret;
3064 void *p;
3065 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003066 char *snap_name;
3067
3068 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
3069 reply_buf = kmalloc(size, GFP_KERNEL);
3070 if (!reply_buf)
3071 return ERR_PTR(-ENOMEM);
3072
3073 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
Alex Elder36be9a72013-01-19 00:30:28 -06003074 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003075 "rbd", "get_snapshot_name",
3076 (char *) &snap_id, sizeof (snap_id),
Alex Elder07b23912012-11-09 08:43:16 -06003077 reply_buf, size, NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06003078 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003079 if (ret < 0)
3080 goto out;
3081
3082 p = reply_buf;
3083 end = (char *) reply_buf + size;
Alex Eldere5c35532012-10-25 23:34:41 -05003084 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003085 if (IS_ERR(snap_name)) {
3086 ret = PTR_ERR(snap_name);
3087 goto out;
3088 } else {
3089 dout(" snap_id 0x%016llx snap_name = %s\n",
3090 (unsigned long long) le64_to_cpu(snap_id), snap_name);
3091 }
3092 kfree(reply_buf);
3093
3094 return snap_name;
3095out:
3096 kfree(reply_buf);
3097
3098 return ERR_PTR(ret);
3099}
3100
3101static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3102 u64 *snap_size, u64 *snap_features)
3103{
Alex Eldere0b49862013-01-09 14:44:18 -06003104 u64 snap_id;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003105 u8 order;
3106 int ret;
3107
3108 snap_id = rbd_dev->header.snapc->snaps[which];
3109 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
3110 if (ret)
3111 return ERR_PTR(ret);
3112 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
3113 if (ret)
3114 return ERR_PTR(ret);
3115
3116 return rbd_dev_v2_snap_name(rbd_dev, which);
3117}
3118
3119static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3120 u64 *snap_size, u64 *snap_features)
3121{
3122 if (rbd_dev->image_format == 1)
3123 return rbd_dev_v1_snap_info(rbd_dev, which,
3124 snap_size, snap_features);
3125 if (rbd_dev->image_format == 2)
3126 return rbd_dev_v2_snap_info(rbd_dev, which,
3127 snap_size, snap_features);
3128 return ERR_PTR(-EINVAL);
3129}
3130
Alex Elder117973f2012-08-31 17:29:55 -05003131static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3132{
3133 int ret;
3134 __u8 obj_order;
3135
3136 down_write(&rbd_dev->header_rwsem);
3137
3138 /* Grab old order first, to see if it changes */
3139
3140 obj_order = rbd_dev->header.obj_order,
3141 ret = rbd_dev_v2_image_size(rbd_dev);
3142 if (ret)
3143 goto out;
3144 if (rbd_dev->header.obj_order != obj_order) {
3145 ret = -EIO;
3146 goto out;
3147 }
3148 rbd_update_mapping_size(rbd_dev);
3149
3150 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
3151 dout("rbd_dev_v2_snap_context returned %d\n", ret);
3152 if (ret)
3153 goto out;
3154 ret = rbd_dev_snaps_update(rbd_dev);
3155 dout("rbd_dev_snaps_update returned %d\n", ret);
3156 if (ret)
3157 goto out;
3158 ret = rbd_dev_snaps_register(rbd_dev);
3159 dout("rbd_dev_snaps_register returned %d\n", ret);
3160out:
3161 up_write(&rbd_dev->header_rwsem);
3162
3163 return ret;
3164}
3165
Alex Elder9d475de2012-07-03 16:01:19 -05003166/*
Alex Elder35938152012-08-02 11:29:46 -05003167 * Scan the rbd device's current snapshot list and compare it to the
3168 * newly-received snapshot context. Remove any existing snapshots
3169 * not present in the new snapshot context. Add a new snapshot for
3170 * any snaphots in the snapshot context not in the current list.
3171 * And verify there are no changes to snapshots we already know
3172 * about.
3173 *
3174 * Assumes the snapshots in the snapshot context are sorted by
3175 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
3176 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003177 */
Alex Elder304f6802012-08-31 17:29:52 -05003178static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003179{
Alex Elder35938152012-08-02 11:29:46 -05003180 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
3181 const u32 snap_count = snapc->num_snaps;
Alex Elder35938152012-08-02 11:29:46 -05003182 struct list_head *head = &rbd_dev->snaps;
3183 struct list_head *links = head->next;
3184 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003185
Alex Elder9fcbb802012-08-23 23:48:49 -05003186 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
Alex Elder35938152012-08-02 11:29:46 -05003187 while (index < snap_count || links != head) {
3188 u64 snap_id;
3189 struct rbd_snap *snap;
Alex Eldercd892122012-07-03 16:01:19 -05003190 char *snap_name;
3191 u64 snap_size = 0;
3192 u64 snap_features = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003193
Alex Elder35938152012-08-02 11:29:46 -05003194 snap_id = index < snap_count ? snapc->snaps[index]
3195 : CEPH_NOSNAP;
3196 snap = links != head ? list_entry(links, struct rbd_snap, node)
3197 : NULL;
Alex Elderaafb230e2012-09-06 16:00:54 -05003198 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003199
Alex Elder35938152012-08-02 11:29:46 -05003200 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
3201 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003202
Alex Elder35938152012-08-02 11:29:46 -05003203 /* Existing snapshot not in the new snap context */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003204
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003205 if (rbd_dev->spec->snap_id == snap->id)
Alex Elderd78b6502012-11-09 08:43:15 -06003206 atomic_set(&rbd_dev->exists, 0);
Alex Elder41f38c22012-10-25 23:34:40 -05003207 rbd_remove_snap_dev(snap);
Alex Elder9fcbb802012-08-23 23:48:49 -05003208 dout("%ssnap id %llu has been removed\n",
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003209 rbd_dev->spec->snap_id == snap->id ?
3210 "mapped " : "",
Alex Elder9fcbb802012-08-23 23:48:49 -05003211 (unsigned long long) snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003212
Alex Elder35938152012-08-02 11:29:46 -05003213 /* Done with this list entry; advance */
3214
3215 links = next;
3216 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003217 }
Alex Elder35938152012-08-02 11:29:46 -05003218
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003219 snap_name = rbd_dev_snap_info(rbd_dev, index,
3220 &snap_size, &snap_features);
Alex Eldercd892122012-07-03 16:01:19 -05003221 if (IS_ERR(snap_name))
3222 return PTR_ERR(snap_name);
3223
Alex Elder9fcbb802012-08-23 23:48:49 -05003224 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
3225 (unsigned long long) snap_id);
Alex Elder35938152012-08-02 11:29:46 -05003226 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
3227 struct rbd_snap *new_snap;
3228
3229 /* We haven't seen this snapshot before */
3230
Alex Elderc8d18422012-07-10 20:30:11 -05003231 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
Alex Eldercd892122012-07-03 16:01:19 -05003232 snap_id, snap_size, snap_features);
Alex Elder9fcbb802012-08-23 23:48:49 -05003233 if (IS_ERR(new_snap)) {
3234 int err = PTR_ERR(new_snap);
3235
3236 dout(" failed to add dev, error %d\n", err);
3237
3238 return err;
3239 }
Alex Elder35938152012-08-02 11:29:46 -05003240
3241 /* New goes before existing, or at end of list */
3242
Alex Elder9fcbb802012-08-23 23:48:49 -05003243 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05003244 if (snap)
3245 list_add_tail(&new_snap->node, &snap->node);
3246 else
Alex Elder523f3252012-08-30 00:16:37 -05003247 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05003248 } else {
3249 /* Already have this one */
3250
Alex Elder9fcbb802012-08-23 23:48:49 -05003251 dout(" already present\n");
3252
Alex Eldercd892122012-07-03 16:01:19 -05003253 rbd_assert(snap->size == snap_size);
Alex Elderaafb230e2012-09-06 16:00:54 -05003254 rbd_assert(!strcmp(snap->name, snap_name));
Alex Eldercd892122012-07-03 16:01:19 -05003255 rbd_assert(snap->features == snap_features);
Alex Elder35938152012-08-02 11:29:46 -05003256
3257 /* Done with this list entry; advance */
3258
3259 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003260 }
Alex Elder35938152012-08-02 11:29:46 -05003261
3262 /* Advance to the next entry in the snapshot context */
3263
3264 index++;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003265 }
Alex Elder9fcbb802012-08-23 23:48:49 -05003266 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003267
3268 return 0;
3269}
3270
Alex Elder304f6802012-08-31 17:29:52 -05003271/*
3272 * Scan the list of snapshots and register the devices for any that
3273 * have not already been registered.
3274 */
3275static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
3276{
3277 struct rbd_snap *snap;
3278 int ret = 0;
3279
3280 dout("%s called\n", __func__);
Alex Elder86ff77b2012-08-31 17:29:53 -05003281 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
3282 return -EIO;
Alex Elder304f6802012-08-31 17:29:52 -05003283
3284 list_for_each_entry(snap, &rbd_dev->snaps, node) {
3285 if (!rbd_snap_registered(snap)) {
3286 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
3287 if (ret < 0)
3288 break;
3289 }
3290 }
3291 dout("%s: returning %d\n", __func__, ret);
3292
3293 return ret;
3294}
3295
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003296static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3297{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003298 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05003299 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003300
3301 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003302
Alex Eldercd789ab2012-08-30 00:16:38 -05003303 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003304 dev->bus = &rbd_bus_type;
3305 dev->type = &rbd_device_type;
3306 dev->parent = &rbd_root_dev;
3307 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05003308 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003309 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003310
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003311 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05003312
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003313 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003314}
3315
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003316static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
3317{
3318 device_unregister(&rbd_dev->dev);
3319}
3320
Alex Eldere2839302012-08-29 17:11:06 -05003321static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06003322
3323/*
Alex Elder499afd52012-02-02 08:13:29 -06003324 * Get a unique rbd identifier for the given new rbd_dev, and add
3325 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06003326 */
Alex Eldere2839302012-08-29 17:11:06 -05003327static void rbd_dev_id_get(struct rbd_device *rbd_dev)
Alex Elderb7f23c32012-01-29 13:57:43 -06003328{
Alex Eldere2839302012-08-29 17:11:06 -05003329 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
Alex Elder499afd52012-02-02 08:13:29 -06003330
3331 spin_lock(&rbd_dev_list_lock);
3332 list_add_tail(&rbd_dev->node, &rbd_dev_list);
3333 spin_unlock(&rbd_dev_list_lock);
Alex Eldere2839302012-08-29 17:11:06 -05003334 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3335 (unsigned long long) rbd_dev->dev_id);
Alex Elder1ddbe942012-01-29 13:57:44 -06003336}
Alex Elderb7f23c32012-01-29 13:57:43 -06003337
/*
 * Remove an rbd_dev from the global list, and record that its
 * identifier is no longer in use.
 *
 * If the id being released is the current maximum, the maximum is
 * recomputed from the remaining devices on the list so that low ids
 * can eventually be reused.
 *
 * NOTE(review): dev_id is an int while rbd_dev_id_max is 64-bit;
 * presumably ids never exceed INT_MAX in practice — confirm.
 */
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
{
	struct list_head *tmp;
	int rbd_id = rbd_dev->dev_id;
	int max_id;

	rbd_assert(rbd_id > 0);

	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);

	/*
	 * If the id being "put" is not the current maximum, there
	 * is nothing special we need to do.
	 */
	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
		spin_unlock(&rbd_dev_list_lock);
		return;
	}

	/*
	 * We need to update the current maximum id.  Search the
	 * list to find out what it is.  We're more likely to find
	 * the maximum at the end, so search the list backward.
	 */
	max_id = 0;
	list_for_each_prev(tmp, &rbd_dev_list) {
		struct rbd_device *rbd_dev;

		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id > max_id)
			max_id = rbd_dev->dev_id;
	}
	spin_unlock(&rbd_dev_list_lock);

	/*
	 * The max id could have been updated by rbd_dev_id_get(), in
	 * which case it now accurately reflects the new maximum.
	 * Be careful not to overwrite the maximum value in that
	 * case.
	 */
	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
	dout(" max dev id has been reset\n");
}
3388
/*
 * Advance *buf past any leading white space and return the length
 * of the token (run of non-white-space characters) that follows,
 * which may be zero.  On return *buf points at the first non-space
 * character, if any.  *buf must be NUL-terminated.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() returns nonzero in
	 * the "C" and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";

	*buf += strspn(*buf, whitespace);	/* skip to token start */

	return strcspn(*buf, whitespace);	/* length of the token */
}
3407
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	const char *start;
	size_t len;

	len = next_token(buf);
	start = *buf;
	*buf += len;		/* consume the token unconditionally */

	/* Copy only when the token (plus NUL) fits the caller's buffer */
	if (len < token_size) {
		memcpy(token, start, len);
		token[len] = '\0';
	}

	return len;
}
3437
3438/*
Alex Elderea3352f2012-07-09 21:04:23 -05003439 * Finds the next token in *buf, dynamically allocates a buffer big
3440 * enough to hold a copy of it, and copies the token into the new
3441 * buffer. The copy is guaranteed to be terminated with '\0'. Note
3442 * that a duplicate buffer is created even for a zero-length token.
3443 *
3444 * Returns a pointer to the newly-allocated duplicate, or a null
3445 * pointer if memory for the duplicate was not available. If
3446 * the lenp argument is a non-null pointer, the length of the token
3447 * (not including the '\0') is returned in *lenp.
3448 *
3449 * If successful, the *buf pointer will be updated to point beyond
3450 * the end of the found token.
3451 *
3452 * Note: uses GFP_KERNEL for allocation.
3453 */
3454static inline char *dup_token(const char **buf, size_t *lenp)
3455{
3456 char *dup;
3457 size_t len;
3458
3459 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05003460 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05003461 if (!dup)
3462 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05003463 *(dup + len) = '\0';
3464 *buf += len;
3465
3466 if (lenp)
3467 *lenp = len;
3468
3469 return dup;
3470}
3471
/*
 * Parse the options provided for an "rbd add" (i.e., rbd image
 * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
 * and the data written is passed here via a NUL-terminated buffer.
 * Returns 0 if successful or an error code otherwise.
 *
 * The information extracted from these options is recorded in
 * the other parameters which return dynamically-allocated
 * structures:
 *  ceph_opts
 *      The address of a pointer that will refer to a ceph options
 *      structure.  Caller must release the returned pointer using
 *      ceph_destroy_options() when it is no longer needed.
 *  rbd_opts
 *	Address of an rbd options pointer.  Fully initialized by
 *	this function; caller must release with kfree().
 *  spec
 *	Address of an rbd image specification pointer.  Fully
 *	initialized by this function based on parsed options.
 *	Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_id>
 *      An optional snapshot id.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot id is
 *      provided.  Snapshot mappings are always read-only.
 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	size_t mon_addrs_size;
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	/* Not duplicated here; ceph_parse_options() gets start/end below */
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	/* Default error for the "present but empty token" cases below */
	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	spec->image_name = dup_token(&buf, NULL);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!spec->snap_name)
		goto out_mem;
	*(spec->snap_name + len) = '\0';

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	/* rbd-specific option tokens are parsed via the callback */
	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	/* Success: transfer ownership of all three results to the caller */
	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}
3615
/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.
	 */
	if (rbd_dev->spec->image_id)
		return 0;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	/* sizeof () includes the NUL, so this covers prefix + name + NUL */
	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* Invoke the "get_id" method of the "rbd" object class */
	ret = rbd_obj_method_sync(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX, NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_obj_method_sync() can return positive */

	/* Decode the length-prefixed string into a fresh allocation */
	p = response;
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
				p + RBD_IMAGE_ID_LEN_MAX,
				NULL, GFP_NOIO);
	if (IS_ERR(rbd_dev->spec->image_id)) {
		ret = PTR_ERR(rbd_dev->spec->image_id);
		rbd_dev->spec->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
3691
/*
 * Probe an image assuming it is in the original (format 1) rbd
 * layout: record an empty image id, derive the header object name
 * from the image name, and read the on-disk header.  On error, any
 * fields set here are freed and reset to NULL.  Returns 0 on
 * success or a negative errno.
 */
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->spec->image_id)
		return -ENOMEM;

	/* Record the header object name for this rbd image. */

	/* sizeof () includes the NUL, so this covers name + suffix + NUL */
	size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	sprintf(rbd_dev->header_name, "%s%s",
		rbd_dev->spec->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;

	/* Version 1 images have no parent (no layering) */

	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;

	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;

	return ret;
}
3740
/*
 * Probe a format 2 image.  The caller has already determined the
 * image id (rbd_dev->spec->image_id).  Derives the header object
 * name from the id, then fetches the image's size/order, object
 * prefix, features, parent info (if layered), and snapshot context.
 * On error all fields populated here are released again.  Returns 0
 * on success or a negative errno.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	/* Undo everything set above, in roughly reverse order */
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3813
/*
 * Finish setting up a probed rbd device: update its snapshot list,
 * fill in the mapping, allocate a device id, register the block
 * device and disk, add the device to sysfs, register its snapshots,
 * set up the header watch, and finally announce the disk.  Returns
 * 0 (then rbd_add() returns count) on success, negative errno on
 * failure; each failure path unwinds exactly the steps completed.
 */
static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_probe_update_spec(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	/* register_blkdev(0, ...) dynamically allocates a major number */
	ret = register_blkdev(0, rbd_dev->name);
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	ret = rbd_dev_header_watch_sync(rbd_dev, 1);
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;
err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}
3895
Alex Eldera30b71b2012-07-10 20:30:11 -05003896/*
3897 * Probe for the existence of the header object for the given rbd
3898 * device. For format 2 images this includes determining the image
3899 * id.
3900 */
3901static int rbd_dev_probe(struct rbd_device *rbd_dev)
3902{
3903 int ret;
3904
3905 /*
3906 * Get the id from the image id object. If it's not a
3907 * format 2 image, we'll get ENOENT back, and we'll assume
3908 * it's a format 1 image.
3909 */
3910 ret = rbd_dev_image_id(rbd_dev);
3911 if (ret)
3912 ret = rbd_dev_v1_probe(rbd_dev);
3913 else
3914 ret = rbd_dev_v2_probe(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05003915 if (ret) {
Alex Eldera30b71b2012-07-10 20:30:11 -05003916 dout("probe failed, returning %d\n", ret);
3917
Alex Elder83a06262012-10-30 15:47:17 -05003918 return ret;
3919 }
3920
3921 ret = rbd_dev_probe_finish(rbd_dev);
3922 if (ret)
3923 rbd_header_free(&rbd_dev->header);
3924
Alex Eldera30b71b2012-07-10 20:30:11 -05003925 return ret;
3926}
3927
/*
 * Handle a write to /sys/bus/rbd/add: parse the mapping request,
 * obtain a ceph client, resolve the pool, create the rbd_device and
 * probe/activate it.  Returns count on success or a negative errno.
 *
 * Ownership notes: ceph_opts is consumed by rbd_get_client(), and
 * rbdc/spec are consumed by rbd_dev_create(); the corresponding
 * local pointers are NULLed once ownership transfers so the error
 * paths below free only what this function still owns.
 */
static ssize_t rbd_add(struct bus_type *bus,
			const char *buf,
			size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	/* Hold a module reference for the lifetime of the mapping */
	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto err_out_module;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}
	ceph_opts = NULL;	/* rbd_dev client now owns this */

	/* pick the pool */
	osdc = &rbdc->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
	if (rc < 0)
		goto err_out_client;
	spec->pool_id = (u64) rc;

	/* The ceph file layout needs to fit pool id in 32 bits */

	if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
		rc = -EIO;
		goto err_out_client;
	}

	rbd_dev = rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		goto err_out_client;
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */

	rbd_dev->mapping.read_only = rbd_opts->read_only;
	kfree(rbd_opts);
	rbd_opts = NULL;	/* done with this */

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_rbd_dev;

	return count;
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_opts);
	rbd_spec_put(spec);
err_out_module:
	module_put(THIS_MODULE);

	dout("Error adding device %s\n", buf);

	return (ssize_t) rc;
}
4000
Alex Elderde71a292012-07-03 16:01:19 -05004001static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004002{
4003 struct list_head *tmp;
4004 struct rbd_device *rbd_dev;
4005
Alex Eldere124a822012-01-29 13:57:44 -06004006 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004007 list_for_each(tmp, &rbd_dev_list) {
4008 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05004009 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06004010 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004011 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06004012 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004013 }
Alex Eldere124a822012-01-29 13:57:44 -06004014 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004015 return NULL;
4016}
4017
/*
 * Release callback for an rbd device (set in rbd_bus_add_dev(),
 * invoked by the driver core when the last reference to the device
 * is dropped, e.g. via rbd_bus_del_dev()).  Tears down the watch,
 * the disk and block device, the header, the device id, and the
 * rbd_dev itself, then drops the module reference taken in
 * rbd_add().
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Cancel the header watch, if one was established */
	if (rbd_dev->watch_event)
		rbd_dev_header_watch_sync(rbd_dev, 0);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
4040
/*
 * Handle a write to /sys/bus/rbd/remove: parse the target device
 * id, look the device up, and unregister it unless it is still
 * open.  Returns count on success or a negative errno.
 *
 * NOTE(review): strict_strtoul() is a deprecated alias for
 * kstrtoul(); consider converting.
 * NOTE(review): open_count is read here under ctl_mutex only —
 * presumably the open path serializes against this; confirm there
 * is no open-vs-remove race.
 */
static ssize_t rbd_remove(struct bus_type *bus,
			const char *buf,
			size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	int ret = count;

	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	/* Refuse to remove a device that is still open */
	if (rbd_dev->open_count) {
		ret = -EBUSY;
		goto done;
	}

	rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);

done:
	mutex_unlock(&ctl_mutex);

	return ret;
}
4080
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004081/*
4082 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004083 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004084 */
4085static int rbd_sysfs_init(void)
4086{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004087 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004088
Alex Elderfed4c142012-02-07 12:03:36 -06004089 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06004090 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004091 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004092
Alex Elderfed4c142012-02-07 12:03:36 -06004093 ret = bus_register(&rbd_bus_type);
4094 if (ret < 0)
4095 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004096
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004097 return ret;
4098}
4099
/* Undo rbd_sysfs_init(), in the reverse order of registration. */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
4105
4106int __init rbd_init(void)
4107{
4108 int rc;
4109
4110 rc = rbd_sysfs_init();
4111 if (rc)
4112 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06004113 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004114 return 0;
4115}
4116
/* Module exit: tear down the sysfs presence created by rbd_init(). */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
4121
4122module_init(rbd_init);
4123module_exit(rbd_exit);
4124
4125MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
4126MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
4127MODULE_DESCRIPTION("rados block device");
4128
4129/* following authorship retained from original osdblk.c */
4130MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
4131
4132MODULE_LICENSE("GPL");