blob: 76917cc3e5a1e24bacf0532cdf401ea24d582604 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* It might be useful to have these defined elsewhere */

#define	U8_MAX	((u8) (~0U))
#define	U16_MAX	((u16) (~0U))
#define	U32_MAX	((u32) (~0U))
#define	U64_MAX	((u64) (~0ULL))

#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

/* Snapshot device names are "snap_<name>"; the prefix eats into NAME_MAX */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN \
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

/* Mapping this "snapshot name" means mapping the writable image head */
#define RBD_SNAP_HEAD_NAME	"-"

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */

#define RBD_FEATURE_LAYERING	1

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_ALL	(0)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070097
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These four fields never change for a given rbd image */
	char *object_prefix;	/* prefix for the image's data object names */
	u64 features;		/* RBD_FEATURE_* bits (always 0 for v1 images) */
	__u8 obj_order;		/* NOTE(review): appears to be log2 of the
				 * object size (validity checks compare it
				 * against SECTOR_SHIFT) — confirm */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* total image size, bytes */
	struct ceph_snap_context *snapc;
	char *snap_names;	/* snapshot names, copied back to back from
				 * the on-disk header */
	u64 *snap_sizes;	/* one image_size per snapshot */

	u64 obj_version;	/* version from the last header read */
};
117
/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64 pool_id;
	char *pool_name;

	char *image_id;
	char *image_name;	/* may be NULL (see comment above) */

	u64 snap_id;
	char *snap_name;

	struct kref kref;	/* shared between parent and child rbd_devs */
};
155
/*
 * an instance of the client. multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client *client;
	struct kref kref;		/* shared-client reference count */
	struct list_head node;		/* entry in rbd_client_list */
};

struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

/* How an object request carries its data payload */
enum obj_request_type {
	OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};
Alex Elderbf0d5f502012-11-22 00:00:08 -0600176
/*
 * State for a single request against one RADOS object, possibly one
 * of several issued on behalf of an enclosing image request.
 */
struct rbd_obj_request {
	const char *object_name;
	u64 offset;		/* object start byte */
	u64 length;		/* bytes from offset */

	struct rbd_img_request *img_request;	/* NULL if standalone? — see type */
	struct list_head links;		/* img_request->obj_requests */
	u32 which;			/* posn image request list */

	enum obj_request_type type;	/* selects the union member below */
	union {
		struct bio *bio_list;		/* OBJ_REQUEST_BIO */
		struct {			/* OBJ_REQUEST_PAGES */
			struct page **pages;
			u32 page_count;
		};
	};

	struct ceph_osd_request *osd_req;

	u64 xferred;			/* bytes transferred */
	u64 version;
	s32 result;
	atomic_t done;			/* NOTE(review): presumably set once the
					 * request completes — confirm in the
					 * completion path */
	rbd_obj_callback_t callback;
	struct completion completion;	/* for synchronous waiters */

	struct kref kref;
};
207
/*
 * State for a request against an rbd image, carried out as a list
 * of object requests (one per affected RADOS object).
 */
struct rbd_img_request {
	struct request *rq;		/* originating block-layer request */
	struct rbd_device *rbd_dev;
	u64 offset;	/* starting image byte offset */
	u64 length;	/* byte count from offset */
	bool write_request;	/* false for read */
	union {
		struct ceph_snap_context *snapc;	/* for writes */
		u64 snap_id;		/* for reads */
	};
	spinlock_t completion_lock;/* protects next_completion */
	u32 next_completion;
	rbd_img_callback_t callback;

	u32 obj_request_count;
	struct list_head obj_requests;	/* rbd_obj_request structs */

	struct kref kref;
};

/* Iterators over an image request's object request list */
#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &ireq->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &ireq->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &ireq->obj_requests, links)
234
/* In-memory record of one image snapshot; also a sysfs device */
struct rbd_snap {
	struct device dev;
	const char *name;
	u64 size;		/* image size at this snapshot */
	struct list_head node;	/* entry in rbd_device->snaps */
	u64 id;
	u64 features;
};

/* Attributes of the image (or snapshot) that this device maps */
struct rbd_mapping {
	u64 size;
	u64 features;
	bool read_only;		/* always true when mapping a snapshot */
};
249
/*
 * a single device
 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;		/* blkdev's gendisk and rq */

	u32 image_format;	/* Either 1 or 2 */
	struct rbd_client *rbd_client;

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;	/* queue lock */

	struct rbd_image_header header;
	atomic_t exists;	/* set once the mapping is established */
	struct rbd_spec *spec;	/* identity: pool/image/snap (see rbd_spec) */

	char *header_name;	/* name of the image's header object */

	struct ceph_file_layout layout;

	struct ceph_osd_event *watch_event;
	struct ceph_osd_request *watch_request;

	struct rbd_spec *parent_spec;	/* non-NULL for a layered child image */
	u64 parent_overlap;

	/* protects updating the header */
	struct rw_semaphore header_rwsem;

	struct rbd_mapping mapping;

	struct list_head node;	/* entry in rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
	unsigned long open_count;	/* protected by ctl_mutex */
};
294
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Forward declarations (defined later in this file) */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);

static void rbd_dev_release(struct device *dev);
static void rbd_remove_snap_dev(struct rbd_snap *snap);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
313
/* Bus attributes: "add" and "remove" are write-only control files */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/* No-op release: rbd_root_dev is static, there is nothing to free */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Parent device for all rbd devices */
static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};
333
/*
 * rbd_warn() - emit a KERN_WARNING message tagged with the most
 * specific identification available for @rbd_dev: the disk name,
 * then the image name, then the image id, then the raw pointer.
 * @rbd_dev may be NULL, in which case only the driver name is used.
 */
static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;		/* %pV formats through vaf below */

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}
360
#ifdef RBD_DEBUG
/*
 * Assert that @expr holds; on failure log the expression and BUG().
 *
 * Wrapped in do { } while (0) so the macro behaves as a single
 * statement: the previous bare-if form mis-bound a following "else"
 * (dangling-else) when used unbraced inside if/else.
 */
#define rbd_assert(expr)						\
		do {							\
			if (unlikely(!(expr))) {			\
				printk(KERN_ERR "\nAssertion failure in %s() " \
							"at line %d:\n\n" \
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
				BUG();					\
			}						\
		} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800373
Alex Elder117973f2012-08-31 17:29:55 -0500374static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
375static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700376
/*
 * Block device open callback.  Refuses writable opens of a read-only
 * mapping; otherwise takes a reference on the rbd device and bumps
 * its open count under ctl_mutex.
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	(void) get_device(&rbd_dev->dev);	/* dropped in rbd_release() */
	set_device_ro(bdev, rbd_dev->mapping.read_only);
	rbd_dev->open_count++;
	mutex_unlock(&ctl_mutex);

	return 0;
}
392
/*
 * Block device release callback; balances rbd_open() by dropping
 * the open count and the device reference, under ctl_mutex.
 */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	rbd_assert(rbd_dev->open_count > 0);
	rbd_dev->open_count--;
	put_device(&rbd_dev->dev);	/* taken in rbd_open() */
	mutex_unlock(&ctl_mutex);

	return 0;
}
405
/* Block device operations for /dev/rbd<N> */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
411
412/*
413 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500414 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700415 */
Alex Elderf8c38922012-08-10 13:12:07 -0700416static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700417{
418 struct rbd_client *rbdc;
419 int ret = -ENOMEM;
420
421 dout("rbd_client_create\n");
422 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
423 if (!rbdc)
424 goto out_opt;
425
426 kref_init(&rbdc->kref);
427 INIT_LIST_HEAD(&rbdc->node);
428
Alex Elderbc534d82012-01-29 13:57:44 -0600429 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
430
Alex Elder43ae4702012-07-03 16:01:18 -0500431 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700432 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600433 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500434 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700435
436 ret = ceph_open_session(rbdc->client);
437 if (ret < 0)
438 goto out_err;
439
Alex Elder432b8582012-01-29 13:57:44 -0600440 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700441 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600442 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700443
Alex Elderbc534d82012-01-29 13:57:44 -0600444 mutex_unlock(&ctl_mutex);
445
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700446 dout("rbd_client_create created %p\n", rbdc);
447 return rbdc;
448
449out_err:
450 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600451out_mutex:
452 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700453 kfree(rbdc);
454out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500455 if (ceph_opts)
456 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400457 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700458}
459
460/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700461 * Find a ceph client with specific addr and configuration. If
462 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700463 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700464static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700465{
466 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700467 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700468
Alex Elder43ae4702012-07-03 16:01:18 -0500469 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700470 return NULL;
471
Alex Elder1f7ba332012-08-10 13:12:07 -0700472 spin_lock(&rbd_client_list_lock);
473 list_for_each_entry(client_node, &rbd_client_list, node) {
474 if (!ceph_compare_options(ceph_opts, client_node->client)) {
475 kref_get(&client_node->kref);
476 found = true;
477 break;
478 }
479 }
480 spin_unlock(&rbd_client_list_lock);
481
482 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700483}
484
/*
 * mount options
 *
 * The enum values double as range markers: tokens below Opt_last_int
 * take an integer argument, tokens between Opt_last_int and
 * Opt_last_string take a string, and tokens between Opt_last_string
 * and Opt_last_bool are Boolean flags (see parse_rbd_opts_token()).
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};

/* Per-mapping options accumulated by parse_rbd_opts_token() */
struct rbd_options {
	bool	read_only;
};

#define RBD_READ_ONLY_DEFAULT	false
515
/*
 * match_token() callback: parse one mount option and record it in
 * *private (a struct rbd_options).  The Opt_last_* range markers in
 * the token enum determine how each token's argument is decoded.
 *
 * Returns 0 on success, -EINVAL for an unknown token, or a negative
 * error from match_int().
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* Every token in rbd_opts_tokens must be handled above */
		rbd_assert(false);
		break;
	}
	return 0;
}
556
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  In both cases ownership of @ceph_opts is
 * consumed: either destroyed here (shared client reused) or handed
 * to rbd_client_create().
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (!rbdc)
		return rbd_client_create(ceph_opts);

	/* Reusing an existing client; the options are no longer needed */
	ceph_destroy_options(ceph_opts);

	return rbdc;
}
573
/*
 * Destroy ceph client
 *
 * kref release callback: unlinks the client from rbd_client_list
 * and tears down its ceph_client.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
591
592/*
593 * Drop reference to ceph client node. If it's not referenced anymore, release
594 * it.
595 */
Alex Elder9d3997f2012-10-25 23:34:42 -0500596static void rbd_put_client(struct rbd_client *rbdc)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700597{
Alex Elderc53d5892012-10-25 23:34:42 -0500598 if (rbdc)
599 kref_put(&rbdc->kref, rbd_client_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700600}
601
Alex Eldera30b71b2012-07-10 20:30:11 -0500602static bool rbd_image_format_valid(u32 image_format)
603{
604 return image_format == 1 || image_format == 2;
605}
606
/*
 * Sanity-check an on-disk (format 1) image header before trusting
 * its contents.  Returns false if the header cannot be valid.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
645
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700646/*
647 * Create a new header structure, translate header format from the on-disk
648 * header.
649 */
650static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500651 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700652{
Alex Elderccece232012-07-10 20:30:10 -0500653 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500654 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500655 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500656 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700657
Alex Elder6a523252012-07-19 17:12:59 -0500658 memset(header, 0, sizeof (*header));
659
Alex Elder103a1502012-08-02 11:29:45 -0500660 snap_count = le32_to_cpu(ondisk->snap_count);
661
Alex Elder58c17b02012-08-23 23:22:06 -0500662 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
663 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500664 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700665 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500666 memcpy(header->object_prefix, ondisk->object_prefix, len);
667 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600668
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700669 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500670 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
671
Alex Elder621901d2012-08-23 23:22:06 -0500672 /* Save a copy of the snapshot names */
673
Alex Elderf785cc12012-08-23 23:22:06 -0500674 if (snap_names_len > (u64) SIZE_MAX)
675 return -EIO;
676 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700677 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500678 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500679 /*
680 * Note that rbd_dev_v1_header_read() guarantees
681 * the ondisk buffer we're working with has
682 * snap_names_len bytes beyond the end of the
683 * snapshot id array, this memcpy() is safe.
684 */
685 memcpy(header->snap_names, &ondisk->snaps[snap_count],
686 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500687
Alex Elder621901d2012-08-23 23:22:06 -0500688 /* Record each snapshot's size */
689
Alex Elderd2bb24e2012-07-26 23:37:14 -0500690 size = snap_count * sizeof (*header->snap_sizes);
691 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700692 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500693 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500694 for (i = 0; i < snap_count; i++)
695 header->snap_sizes[i] =
696 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700697 } else {
Alex Elderccece232012-07-10 20:30:10 -0500698 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700699 header->snap_names = NULL;
700 header->snap_sizes = NULL;
701 }
Alex Elder849b4262012-07-09 21:04:24 -0500702
Alex Elder34b13182012-07-13 20:35:12 -0500703 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700704 header->obj_order = ondisk->options.order;
705 header->crypt_type = ondisk->options.crypt_type;
706 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500707
Alex Elder621901d2012-08-23 23:22:06 -0500708 /* Allocate and fill in the snapshot context */
709
Alex Elderf84344f2012-08-31 17:29:51 -0500710 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500711 size = sizeof (struct ceph_snap_context);
712 size += snap_count * sizeof (header->snapc->snaps[0]);
713 header->snapc = kzalloc(size, GFP_KERNEL);
714 if (!header->snapc)
715 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700716
717 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500718 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700719 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500720 for (i = 0; i < snap_count; i++)
721 header->snapc->snaps[i] =
722 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700723
724 return 0;
725
Alex Elder6a523252012-07-19 17:12:59 -0500726out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500727 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500728 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700729 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500730 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500731 kfree(header->object_prefix);
732 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500733
Alex Elder00f1f362012-02-07 12:03:36 -0600734 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700735}
736
Alex Elder9e15b772012-10-30 19:40:33 -0500737static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
738{
739 struct rbd_snap *snap;
740
741 if (snap_id == CEPH_NOSNAP)
742 return RBD_SNAP_HEAD_NAME;
743
744 list_for_each_entry(snap, &rbd_dev->snaps, node)
745 if (snap_id == snap->id)
746 return snap->name;
747
748 return NULL;
749}
750
Alex Elder8836b992012-08-30 14:42:15 -0500751static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700752{
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700753
Alex Eldere86924a2012-07-10 20:30:11 -0500754 struct rbd_snap *snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600755
Alex Eldere86924a2012-07-10 20:30:11 -0500756 list_for_each_entry(snap, &rbd_dev->snaps, node) {
757 if (!strcmp(snap_name, snap->name)) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500758 rbd_dev->spec->snap_id = snap->id;
Alex Eldere86924a2012-07-10 20:30:11 -0500759 rbd_dev->mapping.size = snap->size;
Alex Elder34b13182012-07-13 20:35:12 -0500760 rbd_dev->mapping.features = snap->features;
Alex Elder00f1f362012-02-07 12:03:36 -0600761
Alex Eldere86924a2012-07-10 20:30:11 -0500762 return 0;
Alex Elder00f1f362012-02-07 12:03:36 -0600763 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700764 }
Alex Eldere86924a2012-07-10 20:30:11 -0500765
Alex Elder00f1f362012-02-07 12:03:36 -0600766 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700767}
768
Alex Elder819d52b2012-10-25 23:34:41 -0500769static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700770{
Alex Elder78dc4472012-07-19 08:49:18 -0500771 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700772
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500773 if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
Josh Durgincc9d7342011-11-21 18:19:13 -0800774 sizeof (RBD_SNAP_HEAD_NAME))) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500775 rbd_dev->spec->snap_id = CEPH_NOSNAP;
Alex Elder99c1f082012-08-30 14:42:15 -0500776 rbd_dev->mapping.size = rbd_dev->header.image_size;
Alex Elder34b13182012-07-13 20:35:12 -0500777 rbd_dev->mapping.features = rbd_dev->header.features;
Alex Eldere86924a2012-07-10 20:30:11 -0500778 ret = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700779 } else {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500780 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700781 if (ret < 0)
782 goto done;
Alex Elderf84344f2012-08-31 17:29:51 -0500783 rbd_dev->mapping.read_only = true;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700784 }
Alex Elderd78b6502012-11-09 08:43:15 -0600785 atomic_set(&rbd_dev->exists, 1);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700786done:
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700787 return ret;
788}
789
/*
 * Release all dynamically-allocated fields of an image header.
 * Each pointer is reset to NULL after being freed so a subsequent
 * free (or re-population) of the same header is safe.
 */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	/* snapc is reference-counted, not kfree()d */
	ceph_put_snap_context(header->snapc);
	header->snapc = NULL;
}
801
Alex Elder98571b52013-01-20 14:44:42 -0600802static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700803{
Alex Elder65ccfe22012-08-09 10:33:26 -0700804 char *name;
805 u64 segment;
806 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700807
Alex Elder2fd82b92012-11-09 15:05:54 -0600808 name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
Alex Elder65ccfe22012-08-09 10:33:26 -0700809 if (!name)
810 return NULL;
811 segment = offset >> rbd_dev->header.obj_order;
Alex Elder2fd82b92012-11-09 15:05:54 -0600812 ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
Alex Elder65ccfe22012-08-09 10:33:26 -0700813 rbd_dev->header.object_prefix, segment);
Alex Elder2fd82b92012-11-09 15:05:54 -0600814 if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
Alex Elder65ccfe22012-08-09 10:33:26 -0700815 pr_err("error formatting segment name for #%llu (%d)\n",
816 segment, ret);
817 kfree(name);
818 name = NULL;
819 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700820
Alex Elder65ccfe22012-08-09 10:33:26 -0700821 return name;
822}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700823
Alex Elder65ccfe22012-08-09 10:33:26 -0700824static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
825{
826 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700827
Alex Elder65ccfe22012-08-09 10:33:26 -0700828 return offset & (segment_size - 1);
829}
830
831static u64 rbd_segment_length(struct rbd_device *rbd_dev,
832 u64 offset, u64 length)
833{
834 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
835
836 offset &= segment_size - 1;
837
Alex Elderaafb230e2012-09-06 16:00:54 -0500838 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700839 if (offset + length > segment_size)
840 length = segment_size - offset;
841
842 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700843}
844
845/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700846 * returns the size of an object in the image
847 */
848static u64 rbd_obj_bytes(struct rbd_image_header *header)
849{
850 return 1 << header->obj_order;
851}
852
853/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700854 * bio helpers
855 */
856
857static void bio_chain_put(struct bio *chain)
858{
859 struct bio *tmp;
860
861 while (chain) {
862 tmp = chain;
863 chain = chain->bi_next;
864 bio_put(tmp);
865 }
866}
867
/*
 * zeros a bio chain, starting at specific offset
 *
 * Walks every segment of every bio in the chain, tracking the running
 * byte position (pos).  Once a segment extends past start_ofs, the
 * portion of it at/after start_ofs is zeroed; all later segments are
 * zeroed entirely.  Pages are mapped with bvec_kmap_irq(), so this is
 * safe to call from contexts where the pages are in highmem.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* byte offset of current segment within the chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* Zero only from start_ofs within this segment */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
894
/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 *
 * The range [offset, offset + len) must lie entirely within bio_src.
 * Returns a new bio sharing bio_src's pages (BIO_CLONED), or NULL on
 * invalid arguments (with a one-time warning) or allocation failure.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;	/* offset of range start within first segment */
	unsigned short end_idx;
	unsigned short vcnt;	/* number of bio_vec entries in the clone */
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	/* After this loop resid is the in-use length of the last segment */
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	/* Starting sector advances by the (sector-aligned) byte offset */
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		/* Single segment: both ends are trimmed within it */
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700975
/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;	/* tail pointer for appending to the chain */

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi) {
			/* Source chain ended before len bytes were cloned */
			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
			goto out_err;	/* EINVAL; ran out of bio's */
		}
		/* Clone at most the remainder of the current source bio */
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {
			/* Consumed this source bio entirely; move to next */
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	/* Report back where the un-cloned data begins */
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	/* Undo any partial chain (drops the clones' references) */
	bio_chain_put(chain);

	return NULL;
}
1038
/* Take a reference on an object request. */
static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
{
	kref_get(&obj_request->kref);
}
1043
static void rbd_obj_request_destroy(struct kref *kref);
/*
 * Drop a reference on an object request; the last put destroys it
 * via rbd_obj_request_destroy().
 */
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}
1050
/* Take a reference on an image request. */
static void rbd_img_request_get(struct rbd_img_request *img_request)
{
	kref_get(&img_request->kref);
}
1055
static void rbd_img_request_destroy(struct kref *kref);
/*
 * Drop a reference on an image request; the last put destroys it
 * via rbd_img_request_destroy().
 */
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	kref_put(&img_request->kref, rbd_img_request_destroy);
}
1062
/*
 * Attach an object request to an image request.  The image request
 * holds a reference on the object request for the duration, and the
 * object request records its position (->which) within the image
 * request's list.
 */
static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_obj_request_get(obj_request);
	obj_request->img_request = img_request;
	list_add_tail(&obj_request->links, &img_request->obj_requests);
	obj_request->which = img_request->obj_request_count++;
	/* Guard against the counter wrapping into the sentinel value */
	rbd_assert(obj_request->which != BAD_WHICH);
}
1072
/*
 * Detach an object request from its image request, undoing
 * rbd_img_obj_request_add():  clear its position, unlink it, drop
 * its back-pointer and callback, and release the reference the
 * image request held.
 */
static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->which != BAD_WHICH);
	obj_request->which = BAD_WHICH;
	list_del(&obj_request->links);
	rbd_assert(obj_request->img_request == img_request);
	obj_request->callback = NULL;
	obj_request->img_request = NULL;
	rbd_obj_request_put(obj_request);
}
1084
1085static bool obj_request_type_valid(enum obj_request_type type)
1086{
1087 switch (type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001088 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001089 case OBJ_REQUEST_BIO:
Alex Elder788e2df2013-01-17 12:25:27 -06001090 case OBJ_REQUEST_PAGES:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001091 return true;
1092 default:
1093 return false;
1094 }
1095}
1096
/*
 * Allocate and fill in an OSD request op.  The variadic arguments
 * depend on the opcode, as documented in the per-case comments
 * below (READ/WRITE take offset and length; CALL takes class name,
 * method name, input data and its length; NOTIFY_ACK/WATCH take
 * cookie, version and, for WATCH, a flag).
 *
 * Returns a kzalloc()'d op (GFP_NOIO, I/O path) to be freed with
 * rbd_osd_req_op_destroy(), or NULL on allocation failure or an
 * unsupported opcode.
 */
struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
{
	struct ceph_osd_req_op *op;
	va_list args;
	size_t size;

	op = kzalloc(sizeof (*op), GFP_NOIO);
	if (!op)
		return NULL;
	op->op = opcode;
	va_start(args, opcode);
	switch (opcode) {
	case CEPH_OSD_OP_READ:
	case CEPH_OSD_OP_WRITE:
		/* rbd_osd_req_op_create(READ, offset, length) */
		/* rbd_osd_req_op_create(WRITE, offset, length) */
		op->extent.offset = va_arg(args, u64);
		op->extent.length = va_arg(args, u64);
		/* Only writes carry a data payload */
		if (opcode == CEPH_OSD_OP_WRITE)
			op->payload_len = op->extent.length;
		break;
	case CEPH_OSD_OP_CALL:
		/* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
		op->cls.class_name = va_arg(args, char *);
		size = strlen(op->cls.class_name);
		/* class_len is a u8 on the wire */
		rbd_assert(size <= (size_t) U8_MAX);
		op->cls.class_len = size;
		op->payload_len = size;

		op->cls.method_name = va_arg(args, char *);
		size = strlen(op->cls.method_name);
		/* method_len is a u8 on the wire */
		rbd_assert(size <= (size_t) U8_MAX);
		op->cls.method_len = size;
		op->payload_len += size;

		op->cls.argc = 0;
		op->cls.indata = va_arg(args, void *);
		size = va_arg(args, size_t);
		rbd_assert(size <= (size_t) U32_MAX);
		op->cls.indata_len = (u32) size;
		op->payload_len += size;
		break;
	case CEPH_OSD_OP_NOTIFY_ACK:
	case CEPH_OSD_OP_WATCH:
		/* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
		/* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
		op->watch.cookie = va_arg(args, u64);
		op->watch.ver = va_arg(args, u64);
		/* ver is stored little-endian for the wire */
		op->watch.ver = cpu_to_le64(op->watch.ver);
		if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
			op->watch.flag = (u8) 1;
		break;
	default:
		rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
		kfree(op);
		op = NULL;
		break;
	}
	va_end(args);

	return op;
}
1159
/* Free an op allocated by rbd_osd_req_op_create() (NULL is a no-op). */
static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
{
	kfree(op);
}
1164
/*
 * Send ceph osd request
 *
 * Builds a single-op OSD request for the named object and submits it.
 * Data is carried either by a bio (an extra reference is taken, and
 * dropped on submission failure) or by a page vector.
 *
 * If rbd_cb is non-NULL the call is asynchronous and the callback
 * owns completion/cleanup of the request.  If rbd_cb is NULL the
 * call blocks until the request completes, optionally reporting the
 * reassert version through *ver, and releases the request itself.
 *
 * Returns 0 (async) or the request's result (sync); negative errno
 * on failure to allocate or start the request.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *op,
			  void (*rbd_cb)(struct ceph_osd_request *,
					 struct ceph_msg *),
			  u64 *ver)
{
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;
	struct timespec mtime = CURRENT_TIME;
	int ret;

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n",
		object_name, (unsigned long long) ofs,
		(unsigned long long) len);

	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO);
	if (!osd_req)
		return -ENOMEM;

	osd_req->r_flags = flags;
	osd_req->r_pages = pages;
	if (bio) {
		/* Request holds its own reference to the bio chain */
		osd_req->r_bio = bio;
		bio_get(osd_req->r_bio);
	}

	osd_req->r_callback = rbd_cb;
	osd_req->r_priv = NULL;

	/*
	 * NOTE(review): strncpy() does not NUL-terminate if object_name
	 * fills r_oid, which would make the strlen() below read past the
	 * buffer -- presumably callers guarantee shorter names; confirm.
	 */
	strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
	osd_req->r_oid_len = strlen(osd_req->r_oid);

	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
	osd_req->r_num_pages = calc_pages_for(ofs, len);
	osd_req->r_page_alignment = ofs & ~PAGE_MASK;

	ceph_osdc_build_request(osd_req, ofs, len, 1, op,
				snapc, snapid, &mtime);

	if (op->op == CEPH_OSD_OP_WATCH && op->watch.flag) {
		/* Watch registrations must persist (linger) on the OSD */
		ceph_osdc_set_request_linger(osdc, osd_req);
		rbd_dev->watch_request = osd_req;
	}

	ret = ceph_osdc_start_request(osdc, osd_req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* Synchronous: wait for the reply ourselves */
		u64 version;

		ret = ceph_osdc_wait_request(osdc, osd_req);
		version = le64_to_cpu(osd_req->r_reassert_version.version);
		if (ver)
			*ver = version;
		dout("reassert_ver=%llu\n", (unsigned long long) version);
		ceph_osdc_put_request(osd_req);
	}
	return ret;

done_err:
	/* Drop the bio reference taken above, then the request itself */
	if (bio)
		bio_chain_put(osd_req->r_bio);
	ceph_osdc_put_request(osd_req);

	return ret;
}
1244
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a page vector big enough for inbound_size bytes at the
 * given offset, runs the op synchronously via rbd_do_request(), and
 * for read ops copies the returned data into the caller's inbound
 * buffer.  The page vector is always released before returning.
 *
 * Returns the number of bytes transferred/copied, or negative errno.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   int flags,
			   struct ceph_osd_req_op *op,
			   const char *object_name,
			   u64 ofs, u64 inbound_size,
			   char *inbound,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(op != NULL);

	num_pages = calc_pages_for(ofs, inbound_size);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* NULL callback => rbd_do_request() waits for completion */
	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  object_name, ofs, inbound_size, NULL,
			  pages, num_pages,
			  flags,
			  op,
			  NULL,
			  ver);
	if (ret < 0)
		goto done;

	/* For reads, ret is the byte count to copy back to the caller */
	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1284
/* Submit an object request's OSD request to the OSD client (async). */
static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
				struct rbd_obj_request *obj_request)
{
	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
}
1290
/*
 * Finish an image request: invoke its callback if one was set,
 * otherwise just drop the reference (which may destroy it).
 */
static void rbd_img_request_complete(struct rbd_img_request *img_request)
{
	if (img_request->callback)
		img_request->callback(img_request);
	else
		rbd_img_request_put(img_request);
}
1298
/* Caller is responsible for rbd_obj_request_destroy(obj_request) */

/*
 * Block until the object request completes.  Returns 0 on completion
 * or -ERESTARTSYS if interrupted by a signal.
 */
static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
	return wait_for_completion_interruptible(&obj_request->completion);
}
1305
/*
 * Per-op callback for ops needing no result processing (notify-ack,
 * watch): just mark the object request done.
 */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request,
				struct ceph_osd_op *op)
{
	atomic_set(&obj_request->done, 1);
}
1311
/*
 * Finish an object request: invoke its callback if one was set,
 * otherwise wake anyone blocked in rbd_obj_request_wait().
 */
static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
{
	if (obj_request->callback)
		obj_request->callback(obj_request);
	else
		complete_all(&obj_request->completion);
}
1319
/*
 * Synchronous osd object method call
 *
 * Executes class method class_name.method_name on the named object,
 * passing outbound/outbound_size as the method's input and copying
 * up to inbound_size bytes of its reply into inbound.  Returns the
 * value from rbd_req_sync_op() (bytes copied or negative errno),
 * or -ENOMEM if the op cannot be allocated.
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     u64 *ver)
{
	struct ceph_osd_req_op *op;
	int ret;

	/*
	 * Any input parameters required by the method we're calling
	 * will be sent along with the class and method names as
	 * part of the message payload.  That data and its size are
	 * supplied via the indata and indata_len fields (named from
	 * the perspective of the server side) in the OSD request
	 * operation.
	 */
	op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
					method_name, outbound, outbound_size);
	if (!op)
		return -ENOMEM;

	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op,
			       object_name, 0, inbound_size, inbound,
			       ver);

	rbd_osd_req_op_destroy(op);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1358
/*
 * Per-op completion for reads.  A missing object (-ENOENT) is treated
 * as a successful read of zeroes; a short read is padded with zeroes
 * so the full requested length is accounted for.  Marks the object
 * request done.
 */
static void rbd_osd_read_callback(struct rbd_obj_request *obj_request,
				struct ceph_osd_op *op)
{
	u64 xferred;

	/*
	 * We support a 64-bit length, but ultimately it has to be
	 * passed to blk_end_request(), which takes an unsigned int.
	 */
	xferred = le64_to_cpu(op->extent.length);
	rbd_assert(xferred < (u64) UINT_MAX);
	if (obj_request->result == (s32) -ENOENT) {
		/* Object doesn't exist: read back as all zeroes */
		zero_bio_chain(obj_request->bio_list, 0);
		obj_request->result = 0;
	} else if (xferred < obj_request->length && !obj_request->result) {
		/* Short read: zero-fill the tail and claim the full length */
		zero_bio_chain(obj_request->bio_list, xferred);
		xferred = obj_request->length;
	}
	obj_request->xferred = xferred;
	atomic_set(&obj_request->done, 1);
}
1380
/*
 * Per-op completion for writes: record the transferred length and
 * mark the object request done.
 */
static void rbd_osd_write_callback(struct rbd_obj_request *obj_request,
				struct ceph_osd_op *op)
{
	obj_request->xferred = le64_to_cpu(op->extent.length);
	atomic_set(&obj_request->done, 1);
}
1387
/*
 * OSD client completion callback for object requests.  Decodes the
 * (single-op) reply header, records result/version/xferred on the
 * object request, dispatches to the per-opcode handler, and completes
 * the object request if the handler marked it done.
 */
static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
				struct ceph_msg *msg)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;
	struct ceph_osd_reply_head *reply_head;
	struct ceph_osd_op *op;
	u32 num_ops;
	u16 opcode;

	rbd_assert(osd_req == obj_request->osd_req);
	/* Exactly one of: part of an image request, or standalone */
	rbd_assert(!!obj_request->img_request ^
				(obj_request->which == BAD_WHICH));

	obj_request->xferred = le32_to_cpu(msg->hdr.data_len);
	reply_head = msg->front.iov_base;
	obj_request->result = (s32) le32_to_cpu(reply_head->result);
	obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);

	num_ops = le32_to_cpu(reply_head->num_ops);
	WARN_ON(num_ops != 1);	/* For now */

	op = &reply_head->ops[0];
	opcode = le16_to_cpu(op->op);
	switch (opcode) {
	case CEPH_OSD_OP_READ:
		rbd_osd_read_callback(obj_request, op);
		break;
	case CEPH_OSD_OP_WRITE:
		rbd_osd_write_callback(obj_request, op);
		break;
	case CEPH_OSD_OP_NOTIFY_ACK:
	case CEPH_OSD_OP_WATCH:
		rbd_osd_trivial_callback(obj_request, op);
		break;
	default:
		rbd_warn(NULL, "%s: unsupported op %hu\n",
			obj_request->object_name, (unsigned short) opcode);
		break;
	}

	/* Handlers set ->done; only then is the request completed */
	if (atomic_read(&obj_request->done))
		rbd_obj_request_complete(obj_request);
}
1431
/*
 * Allocate and initialize a single-op OSD request for an object
 * request.  For object requests that belong to an image request the
 * snapshot context (writes) or snapshot id (reads) comes from the
 * image request; standalone requests use no snap context and
 * CEPH_NOSNAP.  Data is attached according to the object request's
 * type (none, bio chain, or page vector).  Returns NULL on
 * allocation failure.
 */
static struct ceph_osd_request *rbd_osd_req_create(
					struct rbd_device *rbd_dev,
					bool write_request,
					struct rbd_obj_request *obj_request,
					struct ceph_osd_req_op *op)
{
	struct rbd_img_request *img_request = obj_request->img_request;
	struct ceph_snap_context *snapc = NULL;
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;
	struct timespec now;
	struct timespec *mtime;
	u64 snap_id = CEPH_NOSNAP;
	u64 offset = obj_request->offset;
	u64 length = obj_request->length;

	if (img_request) {
		rbd_assert(img_request->write_request == write_request);
		if (img_request->write_request)
			snapc = img_request->snapc;
		else
			snap_id = img_request->snap_id;
	}

	/* Allocate and initialize the request, for the single op */

	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
	if (!osd_req)
		return NULL;	/* ENOMEM */

	rbd_assert(obj_request_type_valid(obj_request->type));
	switch (obj_request->type) {
	case OBJ_REQUEST_NODATA:
		break;		/* Nothing to do */
	case OBJ_REQUEST_BIO:
		rbd_assert(obj_request->bio_list != NULL);
		osd_req->r_bio = obj_request->bio_list;
		bio_get(osd_req->r_bio);
		/* osd client requires "num pages" even for bio */
		osd_req->r_num_pages = calc_pages_for(offset, length);
		break;
	case OBJ_REQUEST_PAGES:
		osd_req->r_pages = obj_request->pages;
		osd_req->r_num_pages = obj_request->page_count;
		osd_req->r_page_alignment = offset & ~PAGE_MASK;
		break;
	}

	if (write_request) {
		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
		now = CURRENT_TIME;
		mtime = &now;
	} else {
		osd_req->r_flags = CEPH_OSD_FLAG_READ;
		mtime = NULL;	/* not needed for reads */
		offset = 0;	/* These are not used... */
		length = 0;	/* ...for osd read requests */
	}

	osd_req->r_callback = rbd_osd_req_callback;
	osd_req->r_priv = obj_request;

	osd_req->r_oid_len = strlen(obj_request->object_name);
	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);

	osd_req->r_file_layout = rbd_dev->layout;	/* struct */

	/* osd_req will get its own reference to snapc (if non-null) */

	ceph_osdc_build_request(osd_req, offset, length, 1, op,
				snapc, snap_id, mtime);

	return osd_req;
}
1508
/* Drop the reference on (and typically free) an OSD request. */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1513
/* object_name is assumed to be a non-null pointer and NUL-terminated */

/*
 * Allocate and initialize an object request covering [offset,
 * offset + length) of the named object.  The object name is copied
 * into the same allocation, immediately after the structure, so a
 * single kfree() (via rbd_obj_request_destroy()) releases both.
 * Returns the new request with one reference held, or NULL on
 * allocation failure.
 */
static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
						u64 offset, u64 length,
						enum obj_request_type type)
{
	struct rbd_obj_request *obj_request;
	size_t size;
	char *name;

	rbd_assert(obj_request_type_valid(type));

	size = strlen(object_name) + 1;
	/* One allocation for the request plus its name copy */
	obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
	if (!obj_request)
		return NULL;

	name = (char *)(obj_request + 1);
	obj_request->object_name = memcpy(name, object_name, size);
	obj_request->offset = offset;
	obj_request->length = length;
	obj_request->which = BAD_WHICH;	/* not part of an image request yet */
	obj_request->type = type;
	INIT_LIST_HEAD(&obj_request->links);
	atomic_set(&obj_request->done, 0);
	init_completion(&obj_request->completion);
	kref_init(&obj_request->kref);

	return obj_request;
}
1544
/*
 * kref release function for object requests.  The request must
 * already be detached from any image request.  Releases the OSD
 * request and any attached data (bio chain or page vector), then
 * frees the request (which includes its embedded name copy).
 */
static void rbd_obj_request_destroy(struct kref *kref)
{
	struct rbd_obj_request *obj_request;

	obj_request = container_of(kref, struct rbd_obj_request, kref);

	rbd_assert(obj_request->img_request == NULL);
	rbd_assert(obj_request->which == BAD_WHICH);

	if (obj_request->osd_req)
		rbd_osd_req_destroy(obj_request->osd_req);

	rbd_assert(obj_request_type_valid(obj_request->type));
	switch (obj_request->type) {
	case OBJ_REQUEST_NODATA:
		break;		/* Nothing to do */
	case OBJ_REQUEST_BIO:
		if (obj_request->bio_list)
			bio_chain_put(obj_request->bio_list);
		break;
	case OBJ_REQUEST_PAGES:
		if (obj_request->pages)
			ceph_release_page_vector(obj_request->pages,
						obj_request->page_count);
		break;
	}

	kfree(obj_request);
}
1574
1575/*
1576 * Caller is responsible for filling in the list of object requests
1577 * that comprises the image request, and the Linux request pointer
1578 * (if there is one).
1579 */
/*
 * Allocate and initialize an image request covering byte range
 * [offset, offset + length) of the mapped image.
 *
 * A write request takes a reference on the device's current snapshot
 * context (sampled under header_rwsem); a read instead records the
 * mapped snapshot id.  Returns NULL on allocation failure.
 *
 * NOTE(review): unlike its neighbors this is not declared static —
 * confirm whether an external user exists or static was intended.
 */
struct rbd_img_request *rbd_img_request_create(struct rbd_device *rbd_dev,
					u64 offset, u64 length,
					bool write_request)
{
	struct rbd_img_request *img_request;
	struct ceph_snap_context *snapc = NULL;

	/* GFP_ATOMIC: may be called from the request function context */
	img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
	if (!img_request)
		return NULL;

	if (write_request) {
		down_read(&rbd_dev->header_rwsem);
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
		up_read(&rbd_dev->header_rwsem);
		if (WARN_ON(!snapc)) {
			kfree(img_request);
			return NULL;	/* Shouldn't happen */
		}
	}

	img_request->rq = NULL;
	img_request->rbd_dev = rbd_dev;
	img_request->offset = offset;
	img_request->length = length;
	img_request->write_request = write_request;
	if (write_request)
		img_request->snapc = snapc;
	else
		img_request->snap_id = rbd_dev->spec->snap_id;
	spin_lock_init(&img_request->completion_lock);
	img_request->next_completion = 0;
	img_request->callback = NULL;
	img_request->obj_request_count = 0;
	INIT_LIST_HEAD(&img_request->obj_requests);
	kref_init(&img_request->kref);

	rbd_img_request_get(img_request);	/* Avoid a warning */
	rbd_img_request_put(img_request);	/* TEMPORARY */

	return img_request;
}
1622
1623static void rbd_img_request_destroy(struct kref *kref)
1624{
1625 struct rbd_img_request *img_request;
1626 struct rbd_obj_request *obj_request;
1627 struct rbd_obj_request *next_obj_request;
1628
1629 img_request = container_of(kref, struct rbd_img_request, kref);
1630
1631 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1632 rbd_img_obj_request_del(img_request, obj_request);
1633
1634 if (img_request->write_request)
1635 ceph_put_snap_context(img_request->snapc);
1636
1637 kfree(img_request);
1638}
1639
1640static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
1641 struct bio *bio_list)
1642{
1643 struct rbd_device *rbd_dev = img_request->rbd_dev;
1644 struct rbd_obj_request *obj_request = NULL;
1645 struct rbd_obj_request *next_obj_request;
1646 unsigned int bio_offset;
1647 u64 image_offset;
1648 u64 resid;
1649 u16 opcode;
1650
1651 opcode = img_request->write_request ? CEPH_OSD_OP_WRITE
1652 : CEPH_OSD_OP_READ;
1653 bio_offset = 0;
1654 image_offset = img_request->offset;
1655 rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT);
1656 resid = img_request->length;
1657 while (resid) {
1658 const char *object_name;
1659 unsigned int clone_size;
1660 struct ceph_osd_req_op *op;
1661 u64 offset;
1662 u64 length;
1663
1664 object_name = rbd_segment_name(rbd_dev, image_offset);
1665 if (!object_name)
1666 goto out_unwind;
1667 offset = rbd_segment_offset(rbd_dev, image_offset);
1668 length = rbd_segment_length(rbd_dev, image_offset, resid);
1669 obj_request = rbd_obj_request_create(object_name,
1670 offset, length,
1671 OBJ_REQUEST_BIO);
1672 kfree(object_name); /* object request has its own copy */
1673 if (!obj_request)
1674 goto out_unwind;
1675
1676 rbd_assert(length <= (u64) UINT_MAX);
1677 clone_size = (unsigned int) length;
1678 obj_request->bio_list = bio_chain_clone_range(&bio_list,
1679 &bio_offset, clone_size,
1680 GFP_ATOMIC);
1681 if (!obj_request->bio_list)
1682 goto out_partial;
1683
1684 /*
1685 * Build up the op to use in building the osd
1686 * request. Note that the contents of the op are
1687 * copied by rbd_osd_req_create().
1688 */
1689 op = rbd_osd_req_op_create(opcode, offset, length);
1690 if (!op)
1691 goto out_partial;
1692 obj_request->osd_req = rbd_osd_req_create(rbd_dev,
1693 img_request->write_request,
1694 obj_request, op);
1695 rbd_osd_req_op_destroy(op);
1696 if (!obj_request->osd_req)
1697 goto out_partial;
1698 /* status and version are initially zero-filled */
1699
1700 rbd_img_obj_request_add(img_request, obj_request);
1701
1702 image_offset += length;
1703 resid -= length;
1704 }
1705
1706 return 0;
1707
1708out_partial:
1709 rbd_obj_request_put(obj_request);
1710out_unwind:
1711 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1712 rbd_obj_request_put(obj_request);
1713
1714 return -ENOMEM;
1715}
1716
/*
 * Completion callback for one object request belonging to an image
 * request.  Object requests may complete in any order, but the block
 * layer must see completions in order; next_completion records how
 * far in-order completion has advanced.  Whichever callback finds
 * itself at that boundary sweeps forward over all consecutively-done
 * requests, ending the corresponding portions of the block request.
 */
static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	u32 which = obj_request->which;
	bool more = true;

	img_request = obj_request->img_request;
	rbd_assert(img_request != NULL);
	rbd_assert(img_request->rq != NULL);
	rbd_assert(which != BAD_WHICH);
	rbd_assert(which < img_request->obj_request_count);
	rbd_assert(which >= img_request->next_completion);

	spin_lock_irq(&img_request->completion_lock);
	/* Out-of-order completion: a later sweep will pick this one up */
	if (which != img_request->next_completion)
		goto out;

	for_each_obj_request_from(img_request, obj_request) {
		unsigned int xferred;
		int result;

		rbd_assert(more);
		rbd_assert(which < img_request->obj_request_count);

		/* Stop at the first request that hasn't completed yet */
		if (!atomic_read(&obj_request->done))
			break;

		rbd_assert(obj_request->xferred <= (u64) UINT_MAX);
		xferred = (unsigned int) obj_request->xferred;
		result = (int) obj_request->result;
		if (result)
			rbd_warn(NULL, "obj_request %s result %d xferred %u\n",
				img_request->write_request ? "write" : "read",
				result, xferred);

		more = blk_end_request(img_request->rq, result, xferred);
		which++;
	}
	/* more is false exactly when every object request has been ended */
	rbd_assert(more ^ (which == img_request->obj_request_count));
	img_request->next_completion = which;
out:
	spin_unlock_irq(&img_request->completion_lock);

	if (!more)
		rbd_img_request_complete(img_request);
}
1763
/*
 * Submit every object request in the image request to the osd client.
 * Each gets rbd_img_obj_callback() as its completion callback, and
 * the creator's initial reference is dropped once submitted.
 * Returns 0, or the first submission error (remaining requests are
 * left unsubmitted).
 */
static int rbd_img_request_submit(struct rbd_img_request *img_request)
{
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;

	for_each_obj_request(img_request, obj_request) {
		int ret;

		obj_request->callback = rbd_img_obj_callback;
		ret = rbd_obj_request_submit(osdc, obj_request);
		if (ret)
			return ret;
		/*
		 * The image request has its own reference to each
		 * of its object requests, so we can safely drop the
		 * initial one here.
		 */
		rbd_obj_request_put(obj_request);
	}

	return 0;
}
1787
/*
 * Send a NOTIFY_ACK for the given notification id on the header
 * object.  The ack completes asynchronously; the object request's
 * callback drops its last reference when it does.  On error the
 * reference is dropped here instead.  Returns 0 or a negative errno.
 */
static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver, u64 notify_id)
{
	struct rbd_obj_request *obj_request;
	struct ceph_osd_req_op *op;
	struct ceph_osd_client *osdc;
	int ret;

	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
							OBJ_REQUEST_NODATA);
	if (!obj_request)
		return -ENOMEM;

	ret = -ENOMEM;
	op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
	if (!op)
		goto out;
	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
						obj_request, op);
	rbd_osd_req_op_destroy(op);
	if (!obj_request->osd_req)
		goto out;

	osdc = &rbd_dev->rbd_client->client->osdc;
	/* The completion callback releases the request for us */
	obj_request->callback = rbd_obj_request_put;
	ret = rbd_obj_request_submit(osdc, obj_request);
out:
	if (ret)
		rbd_obj_request_put(obj_request);

	return ret;
}
1820
1821static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1822{
1823 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1824 u64 hver;
1825 int rc;
1826
1827 if (!rbd_dev)
1828 return;
1829
1830 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1831 rbd_dev->header_name, (unsigned long long) notify_id,
1832 (unsigned int) opcode);
1833 rc = rbd_dev_refresh(rbd_dev, &hver);
1834 if (rc)
1835 rbd_warn(rbd_dev, "got notification but failed to "
1836 " update snaps: %d\n", rc);
1837
Alex Eldercf81b602013-01-17 12:18:46 -06001838 rbd_obj_notify_ack(rbd_dev, hver, notify_id);
Alex Elderb8d70032012-11-30 17:53:04 -06001839}
1840
Alex Elder9969ebc2013-01-18 12:31:10 -06001841/*
1842 * Request sync osd watch/unwatch. The value of "start" determines
1843 * whether a watch request is being initiated or torn down.
1844 */
/*
 * Request sync osd watch/unwatch.  The value of "start" determines
 * whether a watch request is being initiated or torn down.
 *
 * On start, a watch event is created and the lingering osd request is
 * recorded in rbd_dev->watch_request so rbd_watch_cb() can be called
 * for notifications.  On teardown, or on any error, the watch event
 * is cancelled.
 *
 * NOTE(review): on a start-path failure after watch_request has been
 * assigned, the stale pointer is not cleared here — verify no caller
 * can observe it.
 */
static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;
	struct ceph_osd_req_op *op;
	int ret;

	/* Starting requires no existing watch; stopping requires one */
	rbd_assert(start ^ !!rbd_dev->watch_event);
	rbd_assert(start ^ !!rbd_dev->watch_request);

	if (start) {
		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev,
						&rbd_dev->watch_event);
		if (ret < 0)
			return ret;
	}

	ret = -ENOMEM;
	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
						OBJ_REQUEST_NODATA);
	if (!obj_request)
		goto out_cancel;

	op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
				rbd_dev->watch_event->cookie,
				rbd_dev->header.obj_version, start);
	if (!op)
		goto out_cancel;
	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true,
							obj_request, op);
	rbd_osd_req_op_destroy(op);
	if (!obj_request->osd_req)
		goto out_cancel;

	if (start) {
		/* Linger so the watch survives osd request resends */
		rbd_dev->watch_request = obj_request->osd_req;
		ceph_osdc_set_request_linger(osdc, rbd_dev->watch_request);
	}
	ret = rbd_obj_request_submit(osdc, obj_request);
	if (ret)
		goto out_cancel;
	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out_cancel;

	ret = obj_request->result;
	if (ret)
		goto out_cancel;

	if (start)
		goto done;	/* Done if setting up the watch request */
out_cancel:
	/* Cancel the event if we're tearing down, or on error */
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
done:
	if (obj_request)
		rbd_obj_request_put(obj_request);

	return ret;
}
1906
/*
 * Block-layer request function.  Entered with q->queue_lock held;
 * the lock is dropped while each fetched request is turned into an
 * image request and submitted, and re-taken before fetching the next.
 * On any failure the whole block request is ended with the error.
 */
static void rbd_request_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	bool read_only = rbd_dev->mapping.read_only;
	struct request *rq;
	int result;

	while ((rq = blk_fetch_request(q))) {
		bool write_request = rq_data_dir(rq) == WRITE;
		struct rbd_img_request *img_request;
		u64 offset;
		u64 length;

		/* Ignore any non-FS requests that filter through. */

		if (rq->cmd_type != REQ_TYPE_FS) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* Drop the queue lock for the allocation/submission work */
		spin_unlock_irq(q->queue_lock);

		/* Disallow writes to a read-only device */

		if (write_request) {
			result = -EROFS;
			if (read_only)
				goto end_request;
			/* Writes are only valid against the head, not a snap */
			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
		}

		/* Quit early if the snapshot has disappeared */

		if (!atomic_read(&rbd_dev->exists)) {
			dout("request for non-existent snapshot");
			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
			result = -ENXIO;
			goto end_request;
		}

		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
		length = (u64) blk_rq_bytes(rq);

		result = -EINVAL;
		/* Guard against offset + length wrapping past U64_MAX */
		if (WARN_ON(offset && length > U64_MAX - offset + 1))
			goto end_request;	/* Shouldn't happen */

		result = -ENOMEM;
		img_request = rbd_img_request_create(rbd_dev, offset, length,
							write_request);
		if (!img_request)
			goto end_request;

		img_request->rq = rq;

		result = rbd_img_request_fill_bio(img_request, rq->bio);
		if (!result)
			result = rbd_img_request_submit(img_request);
		if (result)
			rbd_img_request_put(img_request);
end_request:
		/* Re-take the queue lock before ending or refetching */
		spin_lock_irq(q->queue_lock);
		if (result < 0) {
			rbd_warn(rbd_dev, "obj_request %s result %d\n",
				write_request ? "write" : "read", result);
			__blk_end_request_all(rq, result);
		}
	}
}
1976
/*
 * A request-queue merge_bvec callback.  Makes sure that we don't
 * create a bio that spans multiple osd objects.  One exception is
 * single-page bios, which we handle later in bio_chain_clone_range().
 */
/*
 * Report how many bytes of @bvec may be appended to the bio described
 * by @bmd without the resulting bio crossing an rbd object boundary.
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	sector_t sector_offset;
	sector_t sectors_per_obj;
	sector_t obj_sector_offset;
	int ret;

	/*
	 * Find how far into its rbd object the partition-relative
	 * bio start sector is to offset relative to the enclosing
	 * device.
	 */
	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	obj_sector_offset = sector_offset & (sectors_per_obj - 1);

	/*
	 * Compute the number of bytes from that offset to the end
	 * of the object.  Account for what's already used by the bio.
	 */
	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
	if (ret > bmd->bi_size)
		ret -= bmd->bi_size;
	else
		ret = 0;

	/*
	 * Don't send back more than was asked for.  And if the bio
	 * was empty, let the whole thing through because:  "Note
	 * that a block device *must* allow a single page to be
	 * added to an empty bio."
	 */
	rbd_assert(bvec->bv_len <= PAGE_SIZE);
	if (ret > (int) bvec->bv_len || !bmd->bi_size)
		ret = (int) bvec->bv_len;

	return ret;
}
2022
/*
 * Tear down the device's gendisk: remove it from the system if it was
 * added, clean up its request queue, and drop our disk reference.
 * Safe to call when no disk was ever allocated.
 */
static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk = rbd_dev->disk;

	if (!disk)
		return;

	/* Ordering matters: unhook from the system before freeing */
	if (disk->flags & GENHD_FL_UP)
		del_gendisk(disk);
	if (disk->queue)
		blk_cleanup_queue(disk->queue);
	put_disk(disk);
}
2036
Alex Elder788e2df2013-01-17 12:25:27 -06002037static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2038 const char *object_name,
2039 u64 offset, u64 length,
2040 char *buf, u64 *version)
2041
2042{
2043 struct ceph_osd_req_op *op;
2044 struct rbd_obj_request *obj_request;
2045 struct ceph_osd_client *osdc;
2046 struct page **pages = NULL;
2047 u32 page_count;
2048 int ret;
2049
2050 page_count = (u32) calc_pages_for(offset, length);
2051 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2052 if (IS_ERR(pages))
2053 ret = PTR_ERR(pages);
2054
2055 ret = -ENOMEM;
2056 obj_request = rbd_obj_request_create(object_name, offset, length,
2057 OBJ_REQUEST_PAGES);
2058 if (!obj_request)
2059 goto out;
2060
2061 obj_request->pages = pages;
2062 obj_request->page_count = page_count;
2063
2064 op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length);
2065 if (!op)
2066 goto out;
2067 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2068 obj_request, op);
2069 rbd_osd_req_op_destroy(op);
2070 if (!obj_request->osd_req)
2071 goto out;
2072
2073 osdc = &rbd_dev->rbd_client->client->osdc;
2074 ret = rbd_obj_request_submit(osdc, obj_request);
2075 if (ret)
2076 goto out;
2077 ret = rbd_obj_request_wait(obj_request);
2078 if (ret)
2079 goto out;
2080
2081 ret = obj_request->result;
2082 if (ret < 0)
2083 goto out;
2084 ret = ceph_copy_from_page_vector(pages, buf, 0, obj_request->xferred);
2085 if (version)
2086 *version = obj_request->version;
2087out:
2088 if (obj_request)
2089 rbd_obj_request_put(obj_request);
2090 else
2091 ceph_release_page_vector(pages, page_count);
2092
2093 return ret;
2094}
2095
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002096/*
Alex Elder4156d992012-08-02 11:29:46 -05002097 * Read the complete header for the given rbd device.
2098 *
2099 * Returns a pointer to a dynamically-allocated buffer containing
2100 * the complete and validated header. Caller can pass the address
2101 * of a variable that will be filled in with the version of the
2102 * header object at the time it was read.
2103 *
2104 * Returns a pointer-coded errno if a failure occurs.
2105 */
static struct rbd_image_header_ondisk *
rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		kfree(ondisk);	/* No-op on the first iteration (NULL) */

		/* Size the buffer for the snapshot count seen last pass */
		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return ERR_PTR(-ENOMEM);

		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
				       0, size,
				       (char *) ondisk, version);

		if (ret < 0)
			goto out_err;
		if (WARN_ON((size_t) ret < size)) {
			ret = -ENXIO;
			/*
			 * NOTE(review): ret was just overwritten, so the
			 * "got" value printed here is -ENXIO, not the
			 * actual byte count.
			 */
			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
				size, ret);
			goto out_err;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "invalid header");
			goto out_err;
		}

		/* Re-read if more snapshots appeared since we sized the buf */
		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	return ondisk;

out_err:
	kfree(ondisk);

	return ERR_PTR(ret);
}
2164
2165/*
 * Re-read the image's on-disk header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002167 */
2168static int rbd_read_header(struct rbd_device *rbd_dev,
2169 struct rbd_image_header *header)
2170{
Alex Elder4156d992012-08-02 11:29:46 -05002171 struct rbd_image_header_ondisk *ondisk;
2172 u64 ver = 0;
2173 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002174
Alex Elder4156d992012-08-02 11:29:46 -05002175 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
2176 if (IS_ERR(ondisk))
2177 return PTR_ERR(ondisk);
2178 ret = rbd_header_from_disk(header, ondisk);
2179 if (ret >= 0)
2180 header->obj_version = ver;
2181 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002182
Alex Elder4156d992012-08-02 11:29:46 -05002183 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002184}
2185
Alex Elder41f38c22012-10-25 23:34:40 -05002186static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002187{
2188 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05002189 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002190
Alex Eldera0593292012-07-19 09:09:27 -05002191 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder41f38c22012-10-25 23:34:40 -05002192 rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002193}
2194
Alex Elder94785542012-10-09 13:50:17 -07002195static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
2196{
2197 sector_t size;
2198
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002199 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Alex Elder94785542012-10-09 13:50:17 -07002200 return;
2201
2202 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
2203 dout("setting size to %llu sectors", (unsigned long long) size);
2204 rbd_dev->mapping.size = (u64) size;
2205 set_capacity(rbd_dev->disk, size);
2206}
2207
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002208/*
2209 * only read the first part of the ondisk header, without the snaps info
2210 */
/*
 * Refresh the in-core format 1 header from disk.  The freshly-read
 * header "h" is swapped into rbd_dev->header under header_rwsem held
 * for write; if "hver" is non-null it receives the new header object
 * version.  Returns 0 or a negative errno.
 */
static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	struct rbd_image_header h;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	down_write(&rbd_dev->header_rwsem);

	/* Update image size, and check for resize of mapped image */
	rbd_dev->header.image_size = h.image_size;
	rbd_update_mapping_size(rbd_dev);

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	if (hver)
		*hver = h.obj_version;
	rbd_dev->header.obj_version = h.obj_version;
	/* NOTE(review): image_size was already assigned above; redundant */
	rbd_dev->header.image_size = h.image_size;
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	ret = rbd_dev_snaps_update(rbd_dev);
	if (!ret)
		ret = rbd_dev_snaps_register(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
2251
Alex Elder117973f2012-08-31 17:29:55 -05002252static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
Alex Elder1fe5e992012-07-25 09:32:41 -05002253{
2254 int ret;
2255
Alex Elder117973f2012-08-31 17:29:55 -05002256 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder1fe5e992012-07-25 09:32:41 -05002257 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05002258 if (rbd_dev->image_format == 1)
2259 ret = rbd_dev_v1_refresh(rbd_dev, hver);
2260 else
2261 ret = rbd_dev_v2_refresh(rbd_dev, hver);
Alex Elder1fe5e992012-07-25 09:32:41 -05002262 mutex_unlock(&ctl_mutex);
2263
2264 return ret;
2265}
2266
/*
 * Allocate and configure the gendisk and request queue for a mapped
 * device: I/O limits are sized to the rbd object size, the merge
 * callback prevents bios from spanning objects, and the disk capacity
 * is set from the mapping.  Returns 0 or -ENOMEM.  The disk is not
 * yet added to the system (no add_disk() here).
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
2314
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002315/*
2316 sysfs
2317*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002318
/* Map a sysfs struct device back to its containing rbd_device */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
2323
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002324static ssize_t rbd_size_show(struct device *dev,
2325 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002326{
Alex Elder593a9e72012-02-07 12:03:37 -06002327 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08002328 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002329
Josh Durgina51aa0c2011-12-05 10:35:04 -08002330 down_read(&rbd_dev->header_rwsem);
2331 size = get_capacity(rbd_dev->disk);
2332 up_read(&rbd_dev->header_rwsem);
2333
2334 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002335}
2336
Alex Elder34b13182012-07-13 20:35:12 -05002337/*
2338 * Note this shows the features for whatever's mapped, which is not
2339 * necessarily the base image.
2340 */
/* sysfs: show the mapping's feature bits as a 64-bit hex mask */
static ssize_t rbd_features_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "0x%016llx\n",
			(unsigned long long) rbd_dev->mapping.features);
}
2349
/* sysfs: show the device's block major number */
static ssize_t rbd_major_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%d\n", rbd_dev->major);
}
2357
/* sysfs: show the ceph client id used for this mapping */
static ssize_t rbd_client_id_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "client%lld\n",
			ceph_client_id(rbd_dev->rbd_client->client));
}
2366
/* sysfs: show the name of the pool holding the image */
static ssize_t rbd_pool_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
}
2374
/* sysfs: show the numeric id of the pool holding the image */
static ssize_t rbd_pool_id_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n",
		(unsigned long long) rbd_dev->spec->pool_id);
}
2383
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002384static ssize_t rbd_name_show(struct device *dev,
2385 struct device_attribute *attr, char *buf)
2386{
Alex Elder593a9e72012-02-07 12:03:37 -06002387 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002388
Alex Eldera92ffdf2012-10-30 19:40:33 -05002389 if (rbd_dev->spec->image_name)
2390 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2391
2392 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002393}
2394
/* sysfs: show the image's unique id string */
static ssize_t rbd_image_id_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
}
2402
Alex Elder34b13182012-07-13 20:35:12 -05002403/*
2404 * Shows the name of the currently-mapped snapshot (or
2405 * RBD_SNAP_HEAD_NAME for the base image).
2406 */
/*
 * sysfs: show the name of the currently-mapped snapshot (or
 * RBD_SNAP_HEAD_NAME when the base image is mapped).
 */
static ssize_t rbd_snap_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
}
2415
Alex Elder86b00e02012-10-25 23:34:42 -05002416/*
2417 * For an rbd v2 image, shows the pool id, image id, and snapshot id
2418 * for the parent image. If there is no parent, simply shows
2419 * "(no parent image)".
2420 */
/*
 * sysfs: show the parent image spec (pool, image, snapshot, overlap)
 * for a layered v2 image, or "(no parent image)" when there is none.
 * Emitted as one "key value" pair per line.
 */
static ssize_t rbd_parent_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	struct rbd_spec *spec = rbd_dev->parent_spec;
	int count;
	char *bufp = buf;	/* Advances as each field is appended */

	if (!spec)
		return sprintf(buf, "(no parent image)\n");

	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
			(unsigned long long) spec->pool_id, spec->pool_name);
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
			spec->image_name ? spec->image_name : "(unknown)");
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
			(unsigned long long) spec->snap_id, spec->snap_name);
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
	if (count < 0)
		return count;
	bufp += count;

	/* Total number of bytes written into buf */
	return (ssize_t) (bufp - buf);
}
2458
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002459static ssize_t rbd_image_refresh(struct device *dev,
2460 struct device_attribute *attr,
2461 const char *buf,
2462 size_t size)
2463{
Alex Elder593a9e72012-02-07 12:03:37 -06002464 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05002465 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002466
Alex Elder117973f2012-08-31 17:29:55 -05002467 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05002468
2469 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002470}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002471
/* Per-device attributes exposed under /sys/bus/rbd/devices/<id>/ */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); /* write-only trigger */
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/*
 * Empty release for the device_type; actual cleanup is done via the
 * per-device release set in rbd_bus_add_dev() (rbd_dev_release).
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
2517
2518
2519/*
2520 sysfs - snapshots
2521*/
2522
2523static ssize_t rbd_snap_size_show(struct device *dev,
2524 struct device_attribute *attr,
2525 char *buf)
2526{
2527 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2528
Josh Durgin35915382011-12-05 18:25:13 -08002529 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002530}
2531
2532static ssize_t rbd_snap_id_show(struct device *dev,
2533 struct device_attribute *attr,
2534 char *buf)
2535{
2536 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2537
Josh Durgin35915382011-12-05 18:25:13 -08002538 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002539}
2540
Alex Elder34b13182012-07-13 20:35:12 -05002541static ssize_t rbd_snap_features_show(struct device *dev,
2542 struct device_attribute *attr,
2543 char *buf)
2544{
2545 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2546
2547 return sprintf(buf, "0x%016llx\n",
2548 (unsigned long long) snap->features);
2549}
2550
/* Attributes exposed under each snapshot's sysfs directory */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* Final teardown of an rbd_snap, run when its device refcount drops */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2582
Alex Elder8b8fb992012-10-26 17:25:24 -05002583static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2584{
2585 kref_get(&spec->kref);
2586
2587 return spec;
2588}
2589
2590static void rbd_spec_free(struct kref *kref);
2591static void rbd_spec_put(struct rbd_spec *spec)
2592{
2593 if (spec)
2594 kref_put(&spec->kref, rbd_spec_free);
2595}
2596
2597static struct rbd_spec *rbd_spec_alloc(void)
2598{
2599 struct rbd_spec *spec;
2600
2601 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2602 if (!spec)
2603 return NULL;
2604 kref_init(&spec->kref);
2605
2606 rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */
2607
2608 return spec;
2609}
2610
2611static void rbd_spec_free(struct kref *kref)
2612{
2613 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2614
2615 kfree(spec->pool_name);
2616 kfree(spec->image_id);
2617 kfree(spec->image_name);
2618 kfree(spec->snap_name);
2619 kfree(spec);
2620}
2621
/*
 * Allocate and initialize a new rbd_device.
 *
 * On success the new device takes over the caller's references to
 * @rbdc and @spec (rbd_dev_destroy() drops them).  Returns NULL on
 * allocation failure, leaving the caller's references intact.
 */
struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
				struct rbd_spec *spec)
{
	struct rbd_device *rbd_dev;

	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		return NULL;

	spin_lock_init(&rbd_dev->lock);
	atomic_set(&rbd_dev->exists, 0);	/* not known to exist yet */
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);
	init_rwsem(&rbd_dev->header_rwsem);

	rbd_dev->spec = spec;
	rbd_dev->rbd_client = rbdc;

	/* Initialize the layout used for all rbd requests */

	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);

	return rbd_dev;
}
2649
2650static void rbd_dev_destroy(struct rbd_device *rbd_dev)
2651{
Alex Elder86b00e02012-10-25 23:34:42 -05002652 rbd_spec_put(rbd_dev->parent_spec);
Alex Elderc53d5892012-10-25 23:34:42 -05002653 kfree(rbd_dev->header_name);
2654 rbd_put_client(rbd_dev->rbd_client);
2655 rbd_spec_put(rbd_dev->spec);
2656 kfree(rbd_dev);
2657}
2658
Alex Elder304f6802012-08-31 17:29:52 -05002659static bool rbd_snap_registered(struct rbd_snap *snap)
2660{
2661 bool ret = snap->dev.type == &rbd_snap_device_type;
2662 bool reg = device_is_registered(&snap->dev);
2663
2664 rbd_assert(!ret ^ reg);
2665
2666 return ret;
2667}
2668
Alex Elder41f38c22012-10-25 23:34:40 -05002669static void rbd_remove_snap_dev(struct rbd_snap *snap)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002670{
2671 list_del(&snap->node);
Alex Elder304f6802012-08-31 17:29:52 -05002672 if (device_is_registered(&snap->dev))
2673 device_unregister(&snap->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002674}
2675
Alex Elder14e70852012-07-19 09:09:27 -05002676static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002677 struct device *parent)
2678{
2679 struct device *dev = &snap->dev;
2680 int ret;
2681
2682 dev->type = &rbd_snap_device_type;
2683 dev->parent = parent;
2684 dev->release = rbd_snap_dev_release;
Alex Elderd4b125e2012-07-03 16:01:19 -05002685 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
Alex Elder304f6802012-08-31 17:29:52 -05002686 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2687
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002688 ret = device_register(dev);
2689
2690 return ret;
2691}
2692
Alex Elder4e891e02012-07-10 20:30:10 -05002693static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05002694 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05002695 u64 snap_id, u64 snap_size,
2696 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002697{
Alex Elder4e891e02012-07-10 20:30:10 -05002698 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002699 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002700
2701 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002702 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002703 return ERR_PTR(-ENOMEM);
2704
2705 ret = -ENOMEM;
Alex Elderc8d18422012-07-10 20:30:11 -05002706 snap->name = kstrdup(snap_name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002707 if (!snap->name)
2708 goto err;
2709
Alex Elderc8d18422012-07-10 20:30:11 -05002710 snap->id = snap_id;
2711 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05002712 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05002713
2714 return snap;
2715
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002716err:
2717 kfree(snap->name);
2718 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002719
2720 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002721}
2722
Alex Eldercd892122012-07-03 16:01:19 -05002723static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2724 u64 *snap_size, u64 *snap_features)
2725{
2726 char *snap_name;
2727
2728 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2729
2730 *snap_size = rbd_dev->header.snap_sizes[which];
2731 *snap_features = 0; /* No features for v1 */
2732
2733 /* Skip over names until we find the one we are looking for */
2734
2735 snap_name = rbd_dev->header.snap_names;
2736 while (which--)
2737 snap_name += strlen(snap_name) + 1;
2738
2739 return snap_name;
2740}
2741
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.  Returns 0 on success or a negative errno.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };	/* matches wire format */

	/* Invoke the "get_size" class method on the image's header object */
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf), NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2774
/*
 * Fetch the base image's current size and object order (snapshot id
 * CEPH_NOSNAP) into the in-core header.
 */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
2781
/*
 * Fetch the object prefix (the string used to name the image's data
 * objects) for a format 2 image and store it in the in-core header.
 * Returns 0 on success or a negative errno.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	/* Reply is a single length-prefixed (ceph-encoded) string */
	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
2818
/*
 * Fetch the feature bits for the given snapshot (or for the base
 * image when snap_id is CEPH_NOSNAP).  Returns -ENXIO if the server
 * reports required ("incompat") features this client doesn't know.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	struct {
		__le64 features;	/* features present */
		__le64 incompat;	/* features required to use the image */
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Refuse images whose required features we don't implement */
	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_ALL)
		return -ENXIO;

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2852
/* Fetch the base image's feature bits into the in-core header. */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}
2858
/*
 * Fetch the parent (clone source) information for a format 2 image
 * and record the parent spec and overlap size in the rbd_dev.  A
 * pool id of CEPH_NOPOOL in the reply means the image has no parent,
 * which is not an error.
 */
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	size_t size;
	void *reply_buf = NULL;
	__le64 snapid;
	void *p;
	void *end;
	char *image_id;
	u64 overlap;
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	/* Worst-case size of the encoded reply */
	size = sizeof (__le64) +				/* pool_id */
		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
		sizeof (__le64) +				/* snap_id */
		sizeof (__le64);				/* overlap */
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf) {
		ret = -ENOMEM;
		goto out_err;
	}

	snapid = cpu_to_le64(CEPH_NOSNAP);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_parent",
				(char *) &snapid, sizeof (snapid),
				(char *) reply_buf, size, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out_err;

	ret = -ERANGE;		/* reported if any decode runs short */
	p = reply_buf;
	end = (char *) reply_buf + size;
	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
	if (parent_spec->pool_id == CEPH_NOPOOL)
		goto out;	/* No parent?  No problem. */

	/* The ceph file layout needs to fit pool id in 32 bits */

	ret = -EIO;
	if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
		goto out;

	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(image_id)) {
		ret = PTR_ERR(image_id);
		goto out_err;
	}
	parent_spec->image_id = image_id;	/* spec now owns the string */
	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
	ceph_decode_64_safe(&p, end, overlap, out_err);

	rbd_dev->parent_overlap = overlap;
	rbd_dev->parent_spec = parent_spec;
	parent_spec = NULL;	/* rbd_dev now owns this */
out:
	ret = 0;
out_err:
	kfree(reply_buf);
	rbd_spec_put(parent_spec);	/* no-op if ownership transferred */

	return ret;
}
2927
/*
 * Look up a format 2 image's name given its id by querying the
 * RBD_DIRECTORY object.  Returns a newly-allocated name the caller
 * must kfree, or NULL on any failure (callers tolerate failure).
 */
static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	rbd_assert(!rbd_dev->spec->image_name);

	/* Build the request: the image id as a ceph-encoded string */
	len = strlen(rbd_dev->spec->image_id);
	image_id_size = sizeof (__le32) + len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = (char *) image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
				"rbd", "dir_get_name",
				image_id, image_id_size,
				(char *) reply_buf, size, NULL);
	if (ret < 0)
		goto out;
	p = reply_buf;
	end = (char *) reply_buf + size;
	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;	/* failure reported as NULL, not errno */
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}
2976
2977/*
2978 * When a parent image gets probed, we only have the pool, image,
2979 * and snapshot ids but not the names of any of them. This call
2980 * is made later to fill in those names. It has to be done after
2981 * rbd_dev_snaps_update() has completed because some of the
2982 * information (in particular, snapshot name) is not available
2983 * until then.
2984 */
2985static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
2986{
2987 struct ceph_osd_client *osdc;
2988 const char *name;
2989 void *reply_buf = NULL;
2990 int ret;
2991
2992 if (rbd_dev->spec->pool_name)
2993 return 0; /* Already have the names */
2994
2995 /* Look up the pool name */
2996
2997 osdc = &rbd_dev->rbd_client->client->osdc;
2998 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05002999 if (!name) {
3000 rbd_warn(rbd_dev, "there is no pool with id %llu",
3001 rbd_dev->spec->pool_id); /* Really a BUG() */
3002 return -EIO;
3003 }
Alex Elder9e15b772012-10-30 19:40:33 -05003004
3005 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
3006 if (!rbd_dev->spec->pool_name)
3007 return -ENOMEM;
3008
3009 /* Fetch the image name; tolerate failure here */
3010
3011 name = rbd_dev_image_name(rbd_dev);
Alex Elder69e7a022012-11-01 08:39:26 -05003012 if (name)
Alex Elder9e15b772012-10-30 19:40:33 -05003013 rbd_dev->spec->image_name = (char *) name;
Alex Elder69e7a022012-11-01 08:39:26 -05003014 else
Alex Elder06ecc6c2012-11-01 10:17:15 -05003015 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05003016
3017 /* Look up the snapshot name. */
3018
3019 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
3020 if (!name) {
Alex Elder935dc892012-11-01 10:17:15 -05003021 rbd_warn(rbd_dev, "no snapshot with id %llu",
3022 rbd_dev->spec->snap_id); /* Really a BUG() */
Alex Elder9e15b772012-10-30 19:40:33 -05003023 ret = -EIO;
3024 goto out_err;
3025 }
3026 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
3027 if(!rbd_dev->spec->snap_name)
3028 goto out_err;
3029
3030 return 0;
3031out_err:
3032 kfree(reply_buf);
3033 kfree(rbd_dev->spec->pool_name);
3034 rbd_dev->spec->pool_name = NULL;
3035
3036 return ret;
3037}
3038
Alex Elder6e14b1a2012-07-03 16:01:19 -05003039static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05003040{
3041 size_t size;
3042 int ret;
3043 void *reply_buf;
3044 void *p;
3045 void *end;
3046 u64 seq;
3047 u32 snap_count;
3048 struct ceph_snap_context *snapc;
3049 u32 i;
3050
3051 /*
3052 * We'll need room for the seq value (maximum snapshot id),
3053 * snapshot count, and array of that many snapshot ids.
3054 * For now we have a fixed upper limit on the number we're
3055 * prepared to receive.
3056 */
3057 size = sizeof (__le64) + sizeof (__le32) +
3058 RBD_MAX_SNAP_COUNT * sizeof (__le64);
3059 reply_buf = kzalloc(size, GFP_KERNEL);
3060 if (!reply_buf)
3061 return -ENOMEM;
3062
3063 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
3064 "rbd", "get_snapcontext",
3065 NULL, 0,
Alex Elder07b23912012-11-09 08:43:16 -06003066 reply_buf, size, ver);
Alex Elder35d489f2012-07-03 16:01:19 -05003067 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
3068 if (ret < 0)
3069 goto out;
3070
3071 ret = -ERANGE;
3072 p = reply_buf;
3073 end = (char *) reply_buf + size;
3074 ceph_decode_64_safe(&p, end, seq, out);
3075 ceph_decode_32_safe(&p, end, snap_count, out);
3076
3077 /*
3078 * Make sure the reported number of snapshot ids wouldn't go
3079 * beyond the end of our buffer. But before checking that,
3080 * make sure the computed size of the snapshot context we
3081 * allocate is representable in a size_t.
3082 */
3083 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
3084 / sizeof (u64)) {
3085 ret = -EINVAL;
3086 goto out;
3087 }
3088 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3089 goto out;
3090
3091 size = sizeof (struct ceph_snap_context) +
3092 snap_count * sizeof (snapc->snaps[0]);
3093 snapc = kmalloc(size, GFP_KERNEL);
3094 if (!snapc) {
3095 ret = -ENOMEM;
3096 goto out;
3097 }
3098
3099 atomic_set(&snapc->nref, 1);
3100 snapc->seq = seq;
3101 snapc->num_snaps = snap_count;
3102 for (i = 0; i < snap_count; i++)
3103 snapc->snaps[i] = ceph_decode_64(&p);
3104
3105 rbd_dev->header.snapc = snapc;
3106
3107 dout(" snap context seq = %llu, snap_count = %u\n",
3108 (unsigned long long) seq, (unsigned int) snap_count);
3109
3110out:
3111 kfree(reply_buf);
3112
3113 return 0;
3114}
3115
/*
 * Fetch the name of the snapshot at index @which in the image's
 * snapshot context.  Returns a newly-allocated string the caller
 * must kfree, or an ERR_PTR on failure.
 */
static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	size_t size;
	void *reply_buf;
	__le64 snap_id;
	int ret;
	void *p;
	void *end;
	char *snap_name;

	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_snapshot_name",
				(char *) &snap_id, sizeof (snap_id),
				reply_buf, size, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	/* Reply is a single ceph-encoded (length-prefixed) string */
	p = reply_buf;
	end = (char *) reply_buf + size;
	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out;
	} else {
		dout(" snap_id 0x%016llx snap_name = %s\n",
			(unsigned long long) le64_to_cpu(snap_id), snap_name);
	}
	kfree(reply_buf);

	return snap_name;
out:
	kfree(reply_buf);

	return ERR_PTR(ret);
}
3158
3159static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3160 u64 *snap_size, u64 *snap_features)
3161{
Alex Eldere0b49862013-01-09 14:44:18 -06003162 u64 snap_id;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003163 u8 order;
3164 int ret;
3165
3166 snap_id = rbd_dev->header.snapc->snaps[which];
3167 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
3168 if (ret)
3169 return ERR_PTR(ret);
3170 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
3171 if (ret)
3172 return ERR_PTR(ret);
3173
3174 return rbd_dev_v2_snap_name(rbd_dev, which);
3175}
3176
3177static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3178 u64 *snap_size, u64 *snap_features)
3179{
3180 if (rbd_dev->image_format == 1)
3181 return rbd_dev_v1_snap_info(rbd_dev, which,
3182 snap_size, snap_features);
3183 if (rbd_dev->image_format == 2)
3184 return rbd_dev_v2_snap_info(rbd_dev, which,
3185 snap_size, snap_features);
3186 return ERR_PTR(-EINVAL);
3187}
3188
Alex Elder117973f2012-08-31 17:29:55 -05003189static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3190{
3191 int ret;
3192 __u8 obj_order;
3193
3194 down_write(&rbd_dev->header_rwsem);
3195
3196 /* Grab old order first, to see if it changes */
3197
3198 obj_order = rbd_dev->header.obj_order,
3199 ret = rbd_dev_v2_image_size(rbd_dev);
3200 if (ret)
3201 goto out;
3202 if (rbd_dev->header.obj_order != obj_order) {
3203 ret = -EIO;
3204 goto out;
3205 }
3206 rbd_update_mapping_size(rbd_dev);
3207
3208 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
3209 dout("rbd_dev_v2_snap_context returned %d\n", ret);
3210 if (ret)
3211 goto out;
3212 ret = rbd_dev_snaps_update(rbd_dev);
3213 dout("rbd_dev_snaps_update returned %d\n", ret);
3214 if (ret)
3215 goto out;
3216 ret = rbd_dev_snaps_register(rbd_dev);
3217 dout("rbd_dev_snaps_register returned %d\n", ret);
3218out:
3219 up_write(&rbd_dev->header_rwsem);
3220
3221 return ret;
3222}
3223
/*
 * Scan the rbd device's current snapshot list and compare it to the
 * newly-received snapshot context.  Remove any existing snapshots
 * not present in the new snapshot context.  Add a new snapshot for
 * any snaphots in the snapshot context not in the current list.
 * And verify there are no changes to snapshots we already know
 * about.
 *
 * Assumes the snapshots in the snapshot context are sorted by
 * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
 * are also maintained in that order.)
 */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const u32 snap_count = snapc->num_snaps;
	struct list_head *head = &rbd_dev->snaps;
	struct list_head *links = head->next;
	u32 index = 0;		/* position in the new snapshot context */

	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
	/* Classic sorted-merge: walk both sequences in id order */
	while (index < snap_count || links != head) {
		u64 snap_id;
		struct rbd_snap *snap;
		char *snap_name;
		u64 snap_size = 0;
		u64 snap_features = 0;

		snap_id = index < snap_count ? snapc->snaps[index]
					     : CEPH_NOSNAP;
		snap = links != head ? list_entry(links, struct rbd_snap, node)
				     : NULL;
		rbd_assert(!snap || snap->id != CEPH_NOSNAP);

		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
			struct list_head *next = links->next;

			/* Existing snapshot not in the new snap context */

			/* If it was the mapped snapshot, the image is gone */
			if (rbd_dev->spec->snap_id == snap->id)
				atomic_set(&rbd_dev->exists, 0);
			rbd_remove_snap_dev(snap);
			dout("%ssnap id %llu has been removed\n",
				rbd_dev->spec->snap_id == snap->id ?
							"mapped " : "",
				(unsigned long long) snap->id);

			/* Done with this list entry; advance */

			links = next;
			continue;
		}

		snap_name = rbd_dev_snap_info(rbd_dev, index,
					&snap_size, &snap_features);
		if (IS_ERR(snap_name))
			return PTR_ERR(snap_name);

		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
			(unsigned long long) snap_id);
		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
			struct rbd_snap *new_snap;

			/* We haven't seen this snapshot before */

			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
					snap_id, snap_size, snap_features);
			if (IS_ERR(new_snap)) {
				int err = PTR_ERR(new_snap);

				dout("  failed to add dev, error %d\n", err);

				return err;
			}

			/* New goes before existing, or at end of list */

			dout("  added dev%s\n", snap ? "" : " at end\n");
			if (snap)
				list_add_tail(&new_snap->node, &snap->node);
			else
				list_add_tail(&new_snap->node, head);
		} else {
			/* Already have this one */

			dout("  already present\n");

			/* An existing snapshot must never change */
			rbd_assert(snap->size == snap_size);
			rbd_assert(!strcmp(snap->name, snap_name));
			rbd_assert(snap->features == snap_features);

			/* Done with this list entry; advance */

			links = links->next;
		}

		/* Advance to the next entry in the snapshot context */

		index++;
	}
	dout("%s: done\n", __func__);

	return 0;
}
3328
/*
 * Scan the list of snapshots and register the devices for any that
 * have not already been registered.
 *
 * Returns 0 on success; stops and returns the first negative errno
 * from rbd_register_snap_dev() otherwise (snapshots registered
 * before the failure remain registered).
 *
 * NOTE(review): the caller appears to serialize access to the snaps
 * list (rbd_dev_probe_finish() takes header_rwsem around this call)
 * -- confirm all callers do the same.
 */
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
{
	struct rbd_snap *snap;
	int ret = 0;

	dout("%s called\n", __func__);
	/* The parent rbd device must already be registered in sysfs */
	if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
		return -EIO;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!rbd_snap_registered(snap)) {
			ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
			if (ret < 0)
				break;	/* stop at the first failure */
		}
	}
	dout("%s: returning %d\n", __func__, ret);

	return ret;
}
3353
/*
 * Register the rbd_dev's embedded struct device on the rbd bus,
 * named after its numeric device id.  Once registered, the device's
 * release callback (rbd_dev_release) owns final teardown.
 *
 * Returns 0 on success or a negative errno from device_register().
 *
 * NOTE(review): on device_register() failure the driver core
 * normally requires put_device() to drop the initial reference;
 * here the caller cleans up with rbd_dev_destroy() instead --
 * confirm no reference or memory is leaked on this error path.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	struct device *dev;
	int ret;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	dev = &rbd_dev->dev;
	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;	/* called when last ref is dropped */
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);

	mutex_unlock(&ctl_mutex);

	return ret;
}
3373
/*
 * Unregister the rbd device from sysfs; dropping the last reference
 * triggers rbd_dev_release(), which performs the actual teardown.
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
3378
/* Highest device id handed out so far; ids are never reused while mapped */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* atomic increment guarantees a unique id without holding a lock */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06003395
/*
 * Remove an rbd_dev from the global list, and record that its
 * identifier is no longer in use.
 *
 * If the released id was the current maximum, rbd_dev_id_max is
 * lowered to the highest id still on the list so future ids stay
 * small.  The update uses cmpxchg so a concurrent rbd_dev_id_get()
 * (which bumps the max outside the list lock) is never clobbered.
 */
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
{
	struct list_head *tmp;
	int rbd_id = rbd_dev->dev_id;
	int max_id;

	rbd_assert(rbd_id > 0);	/* ids handed out start at 1 */

	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);

	/*
	 * If the id being "put" is not the current maximum, there
	 * is nothing special we need to do.
	 */
	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
		spin_unlock(&rbd_dev_list_lock);
		return;
	}

	/*
	 * We need to update the current maximum id.  Search the
	 * list to find out what it is.  We're more likely to find
	 * the maximum at the end, so search the list backward.
	 */
	max_id = 0;
	list_for_each_prev(tmp, &rbd_dev_list) {
		/* deliberately shadows the outer rbd_dev parameter */
		struct rbd_device *rbd_dev;

		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id > max_id)
			max_id = rbd_dev->dev_id;
	}
	spin_unlock(&rbd_dev_list_lock);

	/*
	 * The max id could have been updated by rbd_dev_id_get(), in
	 * which case it now accurately reflects the new maximum.
	 * Be careful not to overwrite the maximum value in that
	 * case.
	 */
	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
	dout(" max dev id has been reset\n");
}
3446
/*
 * Advance *buf past any leading white space and report the length
 * of the token (run of non-space characters) found there, without
 * consuming the token itself.  The string at *buf must be
 * '\0'-terminated; a return of 0 means no token remains.
 */
static inline size_t next_token(const char **buf)
{
	/* Characters for which isspace() is nonzero in the "C"/"POSIX" locales */
	static const char whitespace[] = " \f\n\r\t\v";
	size_t leading;

	leading = strspn(*buf, whitespace);
	*buf += leading;		/* now at start of token (or at '\0') */

	return strcspn(*buf, whitespace);
}
3465
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit (the token buffer is then
 * left untouched).
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	/* Characters for which isspace() is nonzero in the "C"/"POSIX" locales */
	static const char whitespace[] = " \f\n\r\t\v";
	size_t len;

	*buf += strspn(*buf, whitespace);	/* skip to start of token */
	len = strcspn(*buf, whitespace);	/* measure the token */

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;	/* consume the token even when it didn't fit */

	return len;
}
3495
3496/*
Alex Elderea3352f2012-07-09 21:04:23 -05003497 * Finds the next token in *buf, dynamically allocates a buffer big
3498 * enough to hold a copy of it, and copies the token into the new
3499 * buffer. The copy is guaranteed to be terminated with '\0'. Note
3500 * that a duplicate buffer is created even for a zero-length token.
3501 *
3502 * Returns a pointer to the newly-allocated duplicate, or a null
3503 * pointer if memory for the duplicate was not available. If
3504 * the lenp argument is a non-null pointer, the length of the token
3505 * (not including the '\0') is returned in *lenp.
3506 *
3507 * If successful, the *buf pointer will be updated to point beyond
3508 * the end of the found token.
3509 *
3510 * Note: uses GFP_KERNEL for allocation.
3511 */
3512static inline char *dup_token(const char **buf, size_t *lenp)
3513{
3514 char *dup;
3515 size_t len;
3516
3517 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05003518 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05003519 if (!dup)
3520 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05003521 *(dup + len) = '\0';
3522 *buf += len;
3523
3524 if (lenp)
3525 *lenp = len;
3526
3527 return dup;
3528}
3529
/*
 * Parse the options provided for an "rbd add" (i.e., rbd image
 * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
 * and the data written is passed here via a NUL-terminated buffer.
 * Returns 0 if successful or an error code otherwise.
 *
 * The information extracted from these options is recorded in
 * the other parameters which return dynamically-allocated
 * structures:
 *  ceph_opts
 *      The address of a pointer that will refer to a ceph options
 *      structure.  Caller must release the returned pointer using
 *      ceph_destroy_options() when it is no longer needed.
 *  rbd_opts
 *	Address of an rbd options pointer.  Fully initialized by
 *	this function; caller must release with kfree().
 *  spec
 *	Address of an rbd image specification pointer.  Fully
 *	initialized by this function based on parsed options.
 *	Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_id>
 *      An optional snapshot id.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot id is
 *      provided.  Snapshot mappings are always read-only.
 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	size_t mon_addrs_size;
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	/* Monitor addresses are not copied; ceph_parse_options() gets
	 * a pointer into the caller's buffer plus an end pointer. */
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	ret = -EINVAL;	/* default error for the "empty token" checks below */
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	spec->image_name = dup_token(&buf, NULL);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	/* Copy len + 1 bytes, then overwrite the last with a NUL */
	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!spec->snap_name)
		goto out_mem;
	*(spec->snap_name + len) = '\0';

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);	/* consumed by ceph_parse_options(); no longer needed */

	/* Success: transfer ownership of all three results to the caller */
	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	/* kfree()/rbd_spec_put() both tolerate NULL arguments */
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}
3673
/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.
	 */
	if (rbd_dev->spec->image_id)
		return 0;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	/* sizeof () counts the prefix's trailing NUL, so no +1 needed */
	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* Invoke the class method that maps an image name to its id */
	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	p = response;
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						NULL, GFP_NOIO);
	if (IS_ERR(rbd_dev->spec->image_id)) {
		ret = PTR_ERR(rbd_dev->spec->image_id);
		rbd_dev->spec->image_id = NULL;	/* leave field unset on error */
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
3749
/*
 * Probe for a format 1 image: record an empty image id (v1 images
 * have none), build the header object name (image name + RBD_SUFFIX)
 * and read the on-disk header into rbd_dev->header.
 *
 * Returns 0 on success; on error all fields set here are released
 * and a negative errno is returned.
 */
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->spec->image_id)
		return -ENOMEM;

	/* Record the header object name for this rbd image. */

	/* sizeof () counts the suffix's trailing NUL, so no +1 needed */
	size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	sprintf(rbd_dev->header_name, "%s%s",
		rbd_dev->spec->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;

	/* Version 1 images have no parent (no layering) */

	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;

	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;

	return ret;
}
3798
/*
 * Probe for a format 2 image.  The image id has already been stored
 * in rbd_dev->spec->image_id by the caller; build the header object
 * name (RBD_HEADER_PREFIX + image id), then fetch the image's size
 * and order, object prefix, features, optional parent info and the
 * snapshot context.
 *
 * Returns 0 on success; on error releases everything populated here
 * and returns a negative errno.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	/* Undo everything set above; helpers left fields NULL on failure */
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3871
/*
 * Complete the probe of an rbd device: update its snapshot list and
 * spec, allocate a device id, register the block device and the
 * sysfs entries, register any snapshot devices, set up the header
 * watch, and finally announce the disk.
 *
 * Returns 0 on success.  On failure, everything acquired up to the
 * failing step is torn down; once rbd_bus_add_dev() has succeeded,
 * teardown is delegated to the sysfs release path via
 * rbd_bus_del_dev().
 */
static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_probe_update_spec(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	ret = register_blkdev(0, rbd_dev->name);	/* 0: allocate dynamically */
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	ret = rbd_dev_header_watch_sync(rbd_dev, 1);	/* 1 = start watching */
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;
err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}
3953
Alex Eldera30b71b2012-07-10 20:30:11 -05003954/*
3955 * Probe for the existence of the header object for the given rbd
3956 * device. For format 2 images this includes determining the image
3957 * id.
3958 */
3959static int rbd_dev_probe(struct rbd_device *rbd_dev)
3960{
3961 int ret;
3962
3963 /*
3964 * Get the id from the image id object. If it's not a
3965 * format 2 image, we'll get ENOENT back, and we'll assume
3966 * it's a format 1 image.
3967 */
3968 ret = rbd_dev_image_id(rbd_dev);
3969 if (ret)
3970 ret = rbd_dev_v1_probe(rbd_dev);
3971 else
3972 ret = rbd_dev_v2_probe(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05003973 if (ret) {
Alex Eldera30b71b2012-07-10 20:30:11 -05003974 dout("probe failed, returning %d\n", ret);
3975
Alex Elder83a06262012-10-30 15:47:17 -05003976 return ret;
3977 }
3978
3979 ret = rbd_dev_probe_finish(rbd_dev);
3980 if (ret)
3981 rbd_header_free(&rbd_dev->header);
3982
Alex Eldera30b71b2012-07-10 20:30:11 -05003983 return ret;
3984}
3985
/*
 * Handler for writes to /sys/bus/rbd/add: parse the user-supplied
 * mapping specification, obtain a ceph client, resolve the pool,
 * and probe and activate the image.
 *
 * Returns the full byte count on success or a negative errno.
 * Ownership note: ceph_opts, then rbdc and spec, are handed off to
 * the structures built from them; the corresponding local pointers
 * are set to NULL so the shared error path frees only what this
 * function still owns.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto err_out_module;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}
	ceph_opts = NULL;	/* rbd_dev client now owns this */

	/* pick the pool */
	osdc = &rbdc->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
	if (rc < 0)
		goto err_out_client;
	spec->pool_id = (u64) rc;

	/* The ceph file layout needs to fit pool id in 32 bits */

	if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
		rc = -EIO;
		goto err_out_client;
	}

	rbd_dev = rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		goto err_out_client;	/* rc is still -ENOMEM here */
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */

	rbd_dev->mapping.read_only = rbd_opts->read_only;
	kfree(rbd_opts);
	rbd_opts = NULL;	/* done with this */

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_rbd_dev;

	return count;
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);	/* also drops its client and spec refs */
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_opts);
	rbd_spec_put(spec);
err_out_module:
	module_put(THIS_MODULE);

	dout("Error adding device %s\n", buf);

	return (ssize_t) rc;
}
4058
/*
 * Look up the rbd_device with the given device id on the global
 * device list.  Returns the device, or NULL if no match is found.
 *
 * NOTE(review): the list lock is dropped before returning, and no
 * reference is taken on the result -- callers presumably rely on
 * ctl_mutex (or similar) to keep the device alive; confirm.
 */
static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
{
	struct list_head *tmp;
	struct rbd_device *rbd_dev;

	spin_lock(&rbd_dev_list_lock);
	list_for_each(tmp, &rbd_dev_list) {
		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id == dev_id) {
			spin_unlock(&rbd_dev_list_lock);
			return rbd_dev;
		}
	}
	spin_unlock(&rbd_dev_list_lock);
	return NULL;
}
4075
/*
 * Final teardown for an rbd device, called by the driver core when the
 * device's last reference is dropped.
 *
 * The order of operations matters: the watch machinery is shut down
 * first so no further notifications arrive, then the disk/blkdev are
 * released, then the in-memory header, and only then the device id and
 * the rbd_device structure itself.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* stop any outstanding linger request for the header watch */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						rbd_dev->watch_request);
	}
	/*
	 * NOTE(review): the 0 argument presumably requests watch
	 * teardown rather than setup — confirm against
	 * rbd_dev_header_watch_sync().
	 */
	if (rbd_dev->watch_event)
		rbd_dev_header_watch_sync(rbd_dev, 0);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);

	/* release module ref, matching the get taken in rbd_add */
	module_put(THIS_MODULE);
}
4104
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004105static ssize_t rbd_remove(struct bus_type *bus,
4106 const char *buf,
4107 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004108{
4109 struct rbd_device *rbd_dev = NULL;
4110 int target_id, rc;
4111 unsigned long ul;
4112 int ret = count;
4113
4114 rc = strict_strtoul(buf, 10, &ul);
4115 if (rc)
4116 return rc;
4117
4118 /* convert to int; abort if we lost anything in the conversion */
4119 target_id = (int) ul;
4120 if (target_id != ul)
4121 return -EINVAL;
4122
4123 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4124
4125 rbd_dev = __rbd_get_dev(target_id);
4126 if (!rbd_dev) {
4127 ret = -ENOENT;
4128 goto done;
4129 }
4130
Alex Elder42382b72012-11-16 09:29:16 -06004131 if (rbd_dev->open_count) {
4132 ret = -EBUSY;
4133 goto done;
4134 }
4135
Alex Elder41f38c22012-10-25 23:34:40 -05004136 rbd_remove_all_snaps(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004137 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004138
4139done:
4140 mutex_unlock(&ctl_mutex);
Alex Elderaafb230e2012-09-06 16:00:54 -05004141
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004142 return ret;
4143}
4144
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004145/*
4146 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004147 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004148 */
4149static int rbd_sysfs_init(void)
4150{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004151 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004152
Alex Elderfed4c142012-02-07 12:03:36 -06004153 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06004154 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004155 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004156
Alex Elderfed4c142012-02-07 12:03:36 -06004157 ret = bus_register(&rbd_bus_type);
4158 if (ret < 0)
4159 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004160
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004161 return ret;
4162}
4163
/* Remove the sysfs entries, in reverse order of rbd_sysfs_init(). */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
4169
4170int __init rbd_init(void)
4171{
4172 int rc;
4173
4174 rc = rbd_sysfs_init();
4175 if (rc)
4176 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06004177 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004178 return 0;
4179}
4180
/* Module exit point: tear down the sysfs interface. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
4185
module_init(rbd_init);
module_exit(rbd_exit);

/* module metadata, visible via modinfo(8) */
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");