blob: 3f5eaea444a0d08fb816c8d49d2f49946f2ccb5a [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elderaafb2302012-09-06 16:00:54 -050044#define RBD_DEBUG /* Activate rbd_assert() calls */
45
Alex Elder593a9e72012-02-07 12:03:37 -060046/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
Alex Elder2647ba32012-11-19 22:55:21 -060055/* It might be useful to have these defined elsewhere */
Alex Elderdf111be2012-08-09 10:33:26 -070056
Alex Elder2647ba32012-11-19 22:55:21 -060057#define U8_MAX ((u8) (~0U))
58#define U16_MAX ((u16) (~0U))
59#define U32_MAX ((u32) (~0U))
60#define U64_MAX ((u64) (~0ULL))
Alex Elderdf111be2012-08-09 10:33:26 -070061
Alex Elderf0f8cef2012-01-29 13:57:44 -060062#define RBD_DRV_NAME "rbd"
63#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070064
65#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
66
Alex Elderd4b125e2012-07-03 16:01:19 -050067#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
68#define RBD_MAX_SNAP_NAME_LEN \
69 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
70
Alex Elder35d489f2012-07-03 16:01:19 -050071#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070072
73#define RBD_SNAP_HEAD_NAME "-"
74
Alex Elder9e15b772012-10-30 19:40:33 -050075/* This allows a single page to hold an image name sent by OSD */
76#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1)
Alex Elder1e130192012-07-03 16:01:19 -050077#define RBD_IMAGE_ID_LEN_MAX 64
Alex Elder9e15b772012-10-30 19:40:33 -050078
Alex Elder1e130192012-07-03 16:01:19 -050079#define RBD_OBJ_PREFIX_LEN_MAX 64
Alex Elder589d30e2012-07-10 20:30:11 -050080
Alex Elderd8891402012-10-09 13:50:17 -070081/* Feature bits */
82
83#define RBD_FEATURE_LAYERING 1
84
85/* Features supported by this (client software) implementation. */
86
87#define RBD_FEATURES_ALL (0)
88
Alex Elder81a89792012-02-02 08:13:30 -060089/*
90 * An RBD device name will be "rbd#", where the "rbd" comes from
91 * RBD_DRV_NAME above, and # is a unique integer identifier.
92 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
93 * enough to hold all possible device names.
94 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070095#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060096#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070097
98/*
99 * block device image metadata (in-memory version)
100 */
101struct rbd_image_header {
Alex Elderf84344f2012-08-31 17:29:51 -0500102 /* These four fields never change for a given rbd image */
Alex Elder849b4262012-07-09 21:04:24 -0500103 char *object_prefix;
Alex Elder34b13182012-07-13 20:35:12 -0500104 u64 features;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700105 __u8 obj_order;
106 __u8 crypt_type;
107 __u8 comp_type;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700108
Alex Elderf84344f2012-08-31 17:29:51 -0500109 /* The remaining fields need to be updated occasionally */
110 u64 image_size;
111 struct ceph_snap_context *snapc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700112 char *snap_names;
113 u64 *snap_sizes;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700114
115 u64 obj_version;
116};
117
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500118/*
119 * An rbd image specification.
120 *
121 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
Alex Elderc66c6e02012-11-01 08:39:26 -0500122 * identify an image. Each rbd_dev structure includes a pointer to
123 * an rbd_spec structure that encapsulates this identity.
124 *
125 * Each of the id's in an rbd_spec has an associated name. For a
126 * user-mapped image, the names are supplied and the id's associated
127 * with them are looked up. For a layered image, a parent image is
128 * defined by the tuple, and the names are looked up.
129 *
130 * An rbd_dev structure contains a parent_spec pointer which is
131 * non-null if the image it represents is a child in a layered
132 * image. This pointer will refer to the rbd_spec structure used
133 * by the parent rbd_dev for its own identity (i.e., the structure
134 * is shared between the parent and child).
135 *
136 * Since these structures are populated once, during the discovery
137 * phase of image construction, they are effectively immutable so
138 * we make no effort to synchronize access to them.
139 *
140 * Note that code herein does not assume the image name is known (it
141 * could be a null pointer).
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500142 */
143struct rbd_spec {
144 u64 pool_id;
145 char *pool_name;
146
147 char *image_id;
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500148 char *image_name;
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500149
150 u64 snap_id;
151 char *snap_name;
152
153 struct kref kref;
154};
155
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700156/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600157 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700158 */
159struct rbd_client {
160 struct ceph_client *client;
161 struct kref kref;
162 struct list_head node;
163};
164
Alex Elderbf0d5f502012-11-22 00:00:08 -0600165struct rbd_img_request;
166typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
167
168#define BAD_WHICH U32_MAX /* Good which or bad which, which? */
169
170struct rbd_obj_request;
171typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
172
Alex Elder788e2df2013-01-17 12:25:27 -0600173enum obj_request_type { OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES };
Alex Elderbf0d5f502012-11-22 00:00:08 -0600174
175struct rbd_obj_request {
176 const char *object_name;
177 u64 offset; /* object start byte */
178 u64 length; /* bytes from offset */
179
180 struct rbd_img_request *img_request;
181 struct list_head links; /* img_request->obj_requests */
182 u32 which; /* posn image request list */
183
184 enum obj_request_type type;
Alex Elder788e2df2013-01-17 12:25:27 -0600185 union {
186 struct bio *bio_list;
187 struct {
188 struct page **pages;
189 u32 page_count;
190 };
191 };
Alex Elderbf0d5f502012-11-22 00:00:08 -0600192
193 struct ceph_osd_request *osd_req;
194
195 u64 xferred; /* bytes transferred */
196 u64 version;
197 s32 result;
198 atomic_t done;
199
200 rbd_obj_callback_t callback;
Alex Elder788e2df2013-01-17 12:25:27 -0600201 struct completion completion;
Alex Elderbf0d5f502012-11-22 00:00:08 -0600202
203 struct kref kref;
204};
205
206struct rbd_img_request {
207 struct request *rq;
208 struct rbd_device *rbd_dev;
209 u64 offset; /* starting image byte offset */
210 u64 length; /* byte count from offset */
211 bool write_request; /* false for read */
212 union {
213 struct ceph_snap_context *snapc; /* for writes */
214 u64 snap_id; /* for reads */
215 };
216 spinlock_t completion_lock;/* protects next_completion */
217 u32 next_completion;
218 rbd_img_callback_t callback;
219
220 u32 obj_request_count;
221 struct list_head obj_requests; /* rbd_obj_request structs */
222
223 struct kref kref;
224};
225
226#define for_each_obj_request(ireq, oreq) \
227 list_for_each_entry(oreq, &ireq->obj_requests, links)
228#define for_each_obj_request_from(ireq, oreq) \
229 list_for_each_entry_from(oreq, &ireq->obj_requests, links)
230#define for_each_obj_request_safe(ireq, oreq, n) \
231 list_for_each_entry_safe_reverse(oreq, n, &ireq->obj_requests, links)
232
/* In-memory record of one image snapshot, exposed via sysfs. */
struct rbd_snap {
	struct device		dev;		/* sysfs device node */
	const char		*name;
	u64			size;		/* image size at snapshot time */
	struct list_head	node;		/* entry on rbd_dev->snaps */
	u64			id;
	u64			features;	/* feature bits at snapshot time */
};
241
/* Properties of the currently mapped image head or snapshot. */
struct rbd_mapping {
	u64                     size;		/* mapped size in bytes */
	u64                     features;	/* mapped feature bits */
	bool			read_only;
};
247
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700248/*
249 * a single device
250 */
251struct rbd_device {
Alex Elderde71a292012-07-03 16:01:19 -0500252 int dev_id; /* blkdev unique id */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700253
254 int major; /* blkdev assigned major */
255 struct gendisk *disk; /* blkdev's gendisk and rq */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700256
Alex Eldera30b71b2012-07-10 20:30:11 -0500257 u32 image_format; /* Either 1 or 2 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700258 struct rbd_client *rbd_client;
259
260 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
261
262 spinlock_t lock; /* queue lock */
263
264 struct rbd_image_header header;
Alex Elderd78b6502012-11-09 08:43:15 -0600265 atomic_t exists;
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500266 struct rbd_spec *spec;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700267
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500268 char *header_name;
Alex Elder971f8392012-10-25 23:34:41 -0500269
Alex Elder0903e872012-11-14 12:25:19 -0600270 struct ceph_file_layout layout;
271
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700272 struct ceph_osd_event *watch_event;
273 struct ceph_osd_request *watch_request;
274
Alex Elder86b00e02012-10-25 23:34:42 -0500275 struct rbd_spec *parent_spec;
276 u64 parent_overlap;
277
Josh Durginc6666012011-11-21 17:11:12 -0800278 /* protects updating the header */
279 struct rw_semaphore header_rwsem;
Alex Elderf84344f2012-08-31 17:29:51 -0500280
281 struct rbd_mapping mapping;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700282
283 struct list_head node;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800284
285 /* list of snapshots */
286 struct list_head snaps;
287
288 /* sysfs related */
289 struct device dev;
Alex Elder42382b72012-11-16 09:29:16 -0600290 unsigned long open_count;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800291};
292
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700293static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a82f2012-01-29 13:57:44 -0600294
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700295static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a82f2012-01-29 13:57:44 -0600296static DEFINE_SPINLOCK(rbd_dev_list_lock);
297
Alex Elder432b8582012-01-29 13:57:44 -0600298static LIST_HEAD(rbd_client_list); /* clients */
299static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700300
Alex Elder304f6802012-08-31 17:29:52 -0500301static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
302static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
303
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800304static void rbd_dev_release(struct device *dev);
Alex Elder41f38c22012-10-25 23:34:40 -0500305static void rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800306
Alex Elderf0f8cef2012-01-29 13:57:44 -0600307static ssize_t rbd_add(struct bus_type *bus, const char *buf,
308 size_t count);
309static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
310 size_t count);
311
312static struct bus_attribute rbd_bus_attrs[] = {
313 __ATTR(add, S_IWUSR, NULL, rbd_add),
314 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
315 __ATTR_NULL
316};
317
318static struct bus_type rbd_bus_type = {
319 .name = "rbd",
320 .bus_attrs = rbd_bus_attrs,
321};
322
323static void rbd_root_dev_release(struct device *dev)
324{
325}
326
327static struct device rbd_root_dev = {
328 .init_name = "rbd",
329 .release = rbd_root_dev_release,
330};
331
/*
 * Emit a KERN_WARNING message, identifying the device as precisely as
 * possible: by disk name, image name, or image id, in that order of
 * preference.  rbd_dev may be NULL.  Uses %pV so the caller's format
 * string and args are expanded by printk itself.
 */
static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}
358
Alex Elderaafb2302012-09-06 16:00:54 -0500359#ifdef RBD_DEBUG
360#define rbd_assert(expr) \
361 if (unlikely(!(expr))) { \
362 printk(KERN_ERR "\nAssertion failure in %s() " \
363 "at line %d:\n\n" \
364 "\trbd_assert(%s);\n\n", \
365 __func__, __LINE__, #expr); \
366 BUG(); \
367 }
368#else /* !RBD_DEBUG */
369# define rbd_assert(expr) ((void) 0)
370#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800371
Alex Elder117973f2012-08-31 17:29:55 -0500372static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
373static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700374
/*
 * Block device open: refuse writable opens of a read-only mapping,
 * otherwise pin the device (get_device) and bump open_count under
 * ctl_mutex so a concurrent remove cannot tear the device down.
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	(void) get_device(&rbd_dev->dev);
	set_device_ro(bdev, rbd_dev->mapping.read_only);
	rbd_dev->open_count++;
	mutex_unlock(&ctl_mutex);

	return 0;
}
390
/*
 * Block device release: drop the open_count taken in rbd_open() and
 * release the device reference, all under ctl_mutex.
 */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	rbd_assert(rbd_dev->open_count > 0);
	rbd_dev->open_count--;
	put_device(&rbd_dev->dev);
	mutex_unlock(&ctl_mutex);

	return 0;
}
403
/* Block device operations for rbd devices. */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
409
410/*
411 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500412 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700413 */
Alex Elderf8c38922012-08-10 13:12:07 -0700414static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700415{
416 struct rbd_client *rbdc;
417 int ret = -ENOMEM;
418
419 dout("rbd_client_create\n");
420 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
421 if (!rbdc)
422 goto out_opt;
423
424 kref_init(&rbdc->kref);
425 INIT_LIST_HEAD(&rbdc->node);
426
Alex Elderbc534d82012-01-29 13:57:44 -0600427 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
428
Alex Elder43ae4702012-07-03 16:01:18 -0500429 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700430 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600431 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500432 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700433
434 ret = ceph_open_session(rbdc->client);
435 if (ret < 0)
436 goto out_err;
437
Alex Elder432b8582012-01-29 13:57:44 -0600438 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700439 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600440 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700441
Alex Elderbc534d82012-01-29 13:57:44 -0600442 mutex_unlock(&ctl_mutex);
443
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700444 dout("rbd_client_create created %p\n", rbdc);
445 return rbdc;
446
447out_err:
448 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600449out_mutex:
450 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700451 kfree(rbdc);
452out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500453 if (ceph_opts)
454 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400455 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700456}
457
458/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700459 * Find a ceph client with specific addr and configuration. If
460 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700461 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700462static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700463{
464 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700465 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700466
Alex Elder43ae4702012-07-03 16:01:18 -0500467 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700468 return NULL;
469
Alex Elder1f7ba332012-08-10 13:12:07 -0700470 spin_lock(&rbd_client_list_lock);
471 list_for_each_entry(client_node, &rbd_client_list, node) {
472 if (!ceph_compare_options(ceph_opts, client_node->client)) {
473 kref_get(&client_node->kref);
474 found = true;
475 break;
476 }
477 }
478 spin_unlock(&rbd_client_list_lock);
479
480 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700481}
482
483/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700484 * mount options
485 */
486enum {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700487 Opt_last_int,
488 /* int args above */
489 Opt_last_string,
490 /* string args above */
Alex Eldercc0538b2012-08-10 13:12:07 -0700491 Opt_read_only,
492 Opt_read_write,
493 /* Boolean args above */
494 Opt_last_bool,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700495};
496
Alex Elder43ae4702012-07-03 16:01:18 -0500497static match_table_t rbd_opts_tokens = {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700498 /* int args above */
499 /* string args above */
Alex Elderbe466c12012-10-22 11:31:26 -0500500 {Opt_read_only, "read_only"},
Alex Eldercc0538b2012-08-10 13:12:07 -0700501 {Opt_read_only, "ro"}, /* Alternate spelling */
502 {Opt_read_write, "read_write"},
503 {Opt_read_write, "rw"}, /* Alternate spelling */
504 /* Boolean args above */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700505 {-1, NULL}
506};
507
Alex Elder98571b52013-01-20 14:44:42 -0600508struct rbd_options {
509 bool read_only;
510};
511
512#define RBD_READ_ONLY_DEFAULT false
513
/*
 * Parse a single mount option token (callback for ceph_parse_options()).
 * @c: the option string; @private: the struct rbd_options to fill in.
 *
 * Only Boolean options are currently defined; the int/string branches
 * exist so the token-range scheme keeps working as options are added.
 * Returns 0 on success or a negative errno on bad input.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* match_token() returned a token we never defined */
		rbd_assert(false);
		break;
	}
	return 0;
}
554
555/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700556 * Get a ceph client with specific addr and configuration, if one does
557 * not exist create it.
558 */
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Consumes ceph_opts either way: an existing
 * client means the options are redundant and destroyed; otherwise
 * rbd_client_create() takes ownership of them.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (rbdc) {
		/* Reusing an existing client; its reference was taken */
		ceph_destroy_options(ceph_opts);
		return rbdc;
	}

	return rbd_client_create(ceph_opts);
}
571
572/*
573 * Destroy ceph client
Alex Elderd23a4b32012-01-29 13:57:43 -0600574 *
Alex Elder432b8582012-01-29 13:57:44 -0600575 * Caller must hold rbd_client_list_lock.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700576 */
577static void rbd_client_release(struct kref *kref)
578{
579 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
580
581 dout("rbd_release_client %p\n", rbdc);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500582 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700583 list_del(&rbdc->node);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500584 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700585
586 ceph_destroy_client(rbdc->client);
587 kfree(rbdc);
588}
589
590/*
591 * Drop reference to ceph client node. If it's not referenced anymore, release
592 * it.
593 */
Alex Elder9d3997f2012-10-25 23:34:42 -0500594static void rbd_put_client(struct rbd_client *rbdc)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700595{
Alex Elderc53d5892012-10-25 23:34:42 -0500596 if (rbdc)
597 kref_put(&rbdc->kref, rbd_client_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700598}
599
Alex Eldera30b71b2012-07-10 20:30:11 -0500600static bool rbd_image_format_valid(u32 image_format)
601{
602 return image_format == 1 || image_format == 2;
603}
604
/*
 * Sanity-check an on-disk (format 1) image header before trusting any
 * of its fields: magic text, object order within usable bounds, and
 * snapshot counts/name lengths that cannot overflow size_t arithmetic
 * in rbd_header_from_disk().  Returns true if the header looks valid.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire the snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
643
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700644/*
645 * Create a new header structure, translate header format from the on-disk
646 * header.
647 */
648static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500649 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700650{
Alex Elderccece232012-07-10 20:30:10 -0500651 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500652 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500653 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500654 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700655
Alex Elder6a523252012-07-19 17:12:59 -0500656 memset(header, 0, sizeof (*header));
657
Alex Elder103a1502012-08-02 11:29:45 -0500658 snap_count = le32_to_cpu(ondisk->snap_count);
659
Alex Elder58c17b02012-08-23 23:22:06 -0500660 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
661 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500662 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700663 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500664 memcpy(header->object_prefix, ondisk->object_prefix, len);
665 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600666
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700667 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500668 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
669
Alex Elder621901d2012-08-23 23:22:06 -0500670 /* Save a copy of the snapshot names */
671
Alex Elderf785cc12012-08-23 23:22:06 -0500672 if (snap_names_len > (u64) SIZE_MAX)
673 return -EIO;
674 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700675 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500676 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500677 /*
678 * Note that rbd_dev_v1_header_read() guarantees
679 * the ondisk buffer we're working with has
680 * snap_names_len bytes beyond the end of the
681 * snapshot id array, this memcpy() is safe.
682 */
683 memcpy(header->snap_names, &ondisk->snaps[snap_count],
684 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500685
Alex Elder621901d2012-08-23 23:22:06 -0500686 /* Record each snapshot's size */
687
Alex Elderd2bb24e2012-07-26 23:37:14 -0500688 size = snap_count * sizeof (*header->snap_sizes);
689 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700690 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500691 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500692 for (i = 0; i < snap_count; i++)
693 header->snap_sizes[i] =
694 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700695 } else {
Alex Elderccece232012-07-10 20:30:10 -0500696 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700697 header->snap_names = NULL;
698 header->snap_sizes = NULL;
699 }
Alex Elder849b4262012-07-09 21:04:24 -0500700
Alex Elder34b13182012-07-13 20:35:12 -0500701 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700702 header->obj_order = ondisk->options.order;
703 header->crypt_type = ondisk->options.crypt_type;
704 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500705
Alex Elder621901d2012-08-23 23:22:06 -0500706 /* Allocate and fill in the snapshot context */
707
Alex Elderf84344f2012-08-31 17:29:51 -0500708 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500709 size = sizeof (struct ceph_snap_context);
710 size += snap_count * sizeof (header->snapc->snaps[0]);
711 header->snapc = kzalloc(size, GFP_KERNEL);
712 if (!header->snapc)
713 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700714
715 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500716 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700717 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500718 for (i = 0; i < snap_count; i++)
719 header->snapc->snaps[i] =
720 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700721
722 return 0;
723
Alex Elder6a523252012-07-19 17:12:59 -0500724out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500725 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500726 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700727 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500728 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500729 kfree(header->object_prefix);
730 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500731
Alex Elder00f1f362012-02-07 12:03:36 -0600732 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700733}
734
Alex Elder9e15b772012-10-30 19:40:33 -0500735static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
736{
737 struct rbd_snap *snap;
738
739 if (snap_id == CEPH_NOSNAP)
740 return RBD_SNAP_HEAD_NAME;
741
742 list_for_each_entry(snap, &rbd_dev->snaps, node)
743 if (snap_id == snap->id)
744 return snap->name;
745
746 return NULL;
747}
748
/*
 * Look up a snapshot by name and, when found, record its id, size and
 * feature bits in the device's spec/mapping.  Returns 0 on success or
 * -ENOENT when no snapshot with that name exists.
 */
static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
{

	struct rbd_snap *snap;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!strcmp(snap_name, snap->name)) {
			rbd_dev->spec->snap_id = snap->id;
			rbd_dev->mapping.size = snap->size;
			rbd_dev->mapping.features = snap->features;

			return 0;
		}
	}

	return -ENOENT;
}
766
Alex Elder819d52b2012-10-25 23:34:41 -0500767static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700768{
Alex Elder78dc4472012-07-19 08:49:18 -0500769 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700770
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500771 if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
Josh Durgincc9d7342011-11-21 18:19:13 -0800772 sizeof (RBD_SNAP_HEAD_NAME))) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500773 rbd_dev->spec->snap_id = CEPH_NOSNAP;
Alex Elder99c1f082012-08-30 14:42:15 -0500774 rbd_dev->mapping.size = rbd_dev->header.image_size;
Alex Elder34b13182012-07-13 20:35:12 -0500775 rbd_dev->mapping.features = rbd_dev->header.features;
Alex Eldere86924a2012-07-10 20:30:11 -0500776 ret = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700777 } else {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500778 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700779 if (ret < 0)
780 goto done;
Alex Elderf84344f2012-08-31 17:29:51 -0500781 rbd_dev->mapping.read_only = true;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700782 }
Alex Elderd78b6502012-11-09 08:43:15 -0600783 atomic_set(&rbd_dev->exists, 1);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700784done:
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700785 return ret;
786}
787
788static void rbd_header_free(struct rbd_image_header *header)
789{
Alex Elder849b4262012-07-09 21:04:24 -0500790 kfree(header->object_prefix);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500791 header->object_prefix = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700792 kfree(header->snap_sizes);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500793 header->snap_sizes = NULL;
Alex Elder849b4262012-07-09 21:04:24 -0500794 kfree(header->snap_names);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500795 header->snap_names = NULL;
Josh Durgind1d25642011-12-05 14:03:05 -0800796 ceph_put_snap_context(header->snapc);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500797 header->snapc = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700798}
799
Alex Elder98571b52013-01-20 14:44:42 -0600800static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700801{
Alex Elder65ccfe22012-08-09 10:33:26 -0700802 char *name;
803 u64 segment;
804 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700805
Alex Elder2fd82b92012-11-09 15:05:54 -0600806 name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
Alex Elder65ccfe22012-08-09 10:33:26 -0700807 if (!name)
808 return NULL;
809 segment = offset >> rbd_dev->header.obj_order;
Alex Elder2fd82b92012-11-09 15:05:54 -0600810 ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
Alex Elder65ccfe22012-08-09 10:33:26 -0700811 rbd_dev->header.object_prefix, segment);
Alex Elder2fd82b92012-11-09 15:05:54 -0600812 if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
Alex Elder65ccfe22012-08-09 10:33:26 -0700813 pr_err("error formatting segment name for #%llu (%d)\n",
814 segment, ret);
815 kfree(name);
816 name = NULL;
817 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700818
Alex Elder65ccfe22012-08-09 10:33:26 -0700819 return name;
820}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700821
Alex Elder65ccfe22012-08-09 10:33:26 -0700822static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
823{
824 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700825
Alex Elder65ccfe22012-08-09 10:33:26 -0700826 return offset & (segment_size - 1);
827}
828
829static u64 rbd_segment_length(struct rbd_device *rbd_dev,
830 u64 offset, u64 length)
831{
832 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
833
834 offset &= segment_size - 1;
835
Alex Elderaafb2302012-09-06 16:00:54 -0500836 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700837 if (offset + length > segment_size)
838 length = segment_size - offset;
839
840 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700841}
842
843/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700844 * returns the size of an object in the image
845 */
846static u64 rbd_obj_bytes(struct rbd_image_header *header)
847{
848 return 1 << header->obj_order;
849}
850
851/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700852 * bio helpers
853 */
854
855static void bio_chain_put(struct bio *chain)
856{
857 struct bio *tmp;
858
859 while (chain) {
860 tmp = chain;
861 chain = chain->bi_next;
862 bio_put(tmp);
863 }
864}
865
/*
 * Zeros a bio chain, starting at the byte offset start_ofs into
 * the chain's data and continuing through its end.  Used to clear
 * the tail of a read buffer when a read comes up short (or all of
 * it when the object does not exist).
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* running byte position within the whole chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* Zero only the tail of a segment that
				 * straddles start_ofs (remainder is 0 for
				 * segments entirely past it). */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
892
/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 *
 * Returns the new bio, or NULL on allocation failure or invalid
 * arguments (zero length, or a range extending past the source).
 * The clone shares the source's pages; only the vector descriptors
 * are copied and trimmed.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;	/* offset into the first cloned segment */
	unsigned short end_idx;
	unsigned short vcnt;
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.  On exit of the loops above,
	 * "resid" holds the length of data used in the final segment.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		/* Single segment: it covers exactly the requested range. */
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700973
/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;	/* cursor: current source bio */
	unsigned int off = *offset;	/* cursor: offset within it */
	struct bio *chain = NULL;
	struct bio **end;		/* where to link the next clone */

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi) {
			/* Ran out of source bios before covering "len". */
			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
			goto out_err;	/* EINVAL; ran out of bio's */
		}
		/* Clone as much of this bio as the remaining length allows. */
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {
			/* Consumed this bio entirely; advance to the next. */
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	/* Report back where the next un-cloned byte lives. */
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	bio_chain_put(chain);

	return NULL;
}
1036
/* Take a reference on an object request. */
static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
{
	kref_get(&obj_request->kref);
}
1041
static void rbd_obj_request_destroy(struct kref *kref);
/* Drop a reference on an object request; destroys it at zero. */
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}
1048
/* Take a reference on an image request. */
static void rbd_img_request_get(struct rbd_img_request *img_request)
{
	kref_get(&img_request->kref);
}
1053
static void rbd_img_request_destroy(struct kref *kref);
/* Drop a reference on an image request; destroys it at zero. */
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	kref_put(&img_request->kref, rbd_img_request_destroy);
}
1060
/*
 * Add an object request to the tail of an image request's list.
 * Takes a reference on the object request on behalf of the image
 * request and records its position ("which") within the image.
 */
static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_obj_request_get(obj_request);
	obj_request->img_request = img_request;
	list_add_tail(&obj_request->links, &img_request->obj_requests);
	obj_request->which = img_request->obj_request_count++;
	rbd_assert(obj_request->which != BAD_WHICH);
}
1070
/*
 * Remove an object request from its image request's list, clear its
 * back-pointers, and drop the reference rbd_img_obj_request_add()
 * took.
 */
static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->which != BAD_WHICH);
	obj_request->which = BAD_WHICH;
	list_del(&obj_request->links);
	rbd_assert(obj_request->img_request == img_request);
	obj_request->callback = NULL;
	obj_request->img_request = NULL;
	rbd_obj_request_put(obj_request);
}
1082
1083static bool obj_request_type_valid(enum obj_request_type type)
1084{
1085 switch (type) {
1086 case OBJ_REQUEST_BIO:
Alex Elder788e2df2013-01-17 12:25:27 -06001087 case OBJ_REQUEST_PAGES:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001088 return true;
1089 default:
1090 return false;
1091 }
1092}
1093
Alex Elder8d23bf22012-11-19 22:55:21 -06001094struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
1095{
1096 struct ceph_osd_req_op *op;
1097 va_list args;
Alex Elder2647ba32012-11-19 22:55:21 -06001098 size_t size;
Alex Elder8d23bf22012-11-19 22:55:21 -06001099
1100 op = kzalloc(sizeof (*op), GFP_NOIO);
1101 if (!op)
1102 return NULL;
1103 op->op = opcode;
1104 va_start(args, opcode);
1105 switch (opcode) {
1106 case CEPH_OSD_OP_READ:
1107 case CEPH_OSD_OP_WRITE:
1108 /* rbd_osd_req_op_create(READ, offset, length) */
1109 /* rbd_osd_req_op_create(WRITE, offset, length) */
1110 op->extent.offset = va_arg(args, u64);
1111 op->extent.length = va_arg(args, u64);
1112 if (opcode == CEPH_OSD_OP_WRITE)
1113 op->payload_len = op->extent.length;
1114 break;
Alex Elder2647ba32012-11-19 22:55:21 -06001115 case CEPH_OSD_OP_CALL:
1116 /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
1117 op->cls.class_name = va_arg(args, char *);
1118 size = strlen(op->cls.class_name);
1119 rbd_assert(size <= (size_t) U8_MAX);
1120 op->cls.class_len = size;
1121 op->payload_len = size;
1122
1123 op->cls.method_name = va_arg(args, char *);
1124 size = strlen(op->cls.method_name);
1125 rbd_assert(size <= (size_t) U8_MAX);
1126 op->cls.method_len = size;
1127 op->payload_len += size;
1128
1129 op->cls.argc = 0;
1130 op->cls.indata = va_arg(args, void *);
1131 size = va_arg(args, size_t);
1132 rbd_assert(size <= (size_t) U32_MAX);
1133 op->cls.indata_len = (u32) size;
1134 op->payload_len += size;
1135 break;
Alex Elder5efea492012-11-19 22:55:21 -06001136 case CEPH_OSD_OP_NOTIFY_ACK:
1137 case CEPH_OSD_OP_WATCH:
1138 /* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
1139 /* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
1140 op->watch.cookie = va_arg(args, u64);
1141 op->watch.ver = va_arg(args, u64);
1142 op->watch.ver = cpu_to_le64(op->watch.ver);
1143 if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
1144 op->watch.flag = (u8) 1;
1145 break;
Alex Elder8d23bf22012-11-19 22:55:21 -06001146 default:
1147 rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
1148 kfree(op);
1149 op = NULL;
1150 break;
1151 }
1152 va_end(args);
1153
1154 return op;
1155}
1156
/* Free an op created by rbd_osd_req_op_create() (NULL is a no-op). */
static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
{
	kfree(op);
}
1161
/*
 * Send ceph osd request
 *
 * Builds an osd request for a single op against "object_name" and
 * starts it.  If "rbd_cb" is NULL the call is synchronous: it waits
 * for completion, optionally returns the reassert version through
 * "ver", and drops the request itself.  With a callback, completion
 * is asynchronous and the callback is responsible for cleanup.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *op,
			  void (*rbd_cb)(struct ceph_osd_request *,
					 struct ceph_msg *),
			  u64 *ver)
{
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;
	struct timespec mtime = CURRENT_TIME;
	int ret;

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n",
		object_name, (unsigned long long) ofs,
		(unsigned long long) len);

	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO);
	if (!osd_req)
		return -ENOMEM;

	osd_req->r_flags = flags;
	osd_req->r_pages = pages;
	if (bio) {
		/* The request holds its own reference on the bio chain. */
		osd_req->r_bio = bio;
		bio_get(osd_req->r_bio);
	}

	osd_req->r_callback = rbd_cb;
	osd_req->r_priv = NULL;

	strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
	osd_req->r_oid_len = strlen(osd_req->r_oid);

	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
	osd_req->r_num_pages = calc_pages_for(ofs, len);
	osd_req->r_page_alignment = ofs & ~PAGE_MASK;

	ceph_osdc_build_request(osd_req, ofs, len, 1, op,
				snapc, snapid, &mtime);

	/* A watch-establishing request must linger so the osd client
	 * re-submits it across connection resets. */
	if (op->op == CEPH_OSD_OP_WATCH && op->watch.flag) {
		ceph_osdc_set_request_linger(osdc, osd_req);
		rbd_dev->watch_request = osd_req;
	}

	ret = ceph_osdc_start_request(osdc, osd_req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* Synchronous: wait here and drop the request ourselves. */
		u64 version;

		ret = ceph_osdc_wait_request(osdc, osd_req);
		version = le64_to_cpu(osd_req->r_reassert_version.version);
		if (ver)
			*ver = version;
		dout("reassert_ver=%llu\n", (unsigned long long) version);
		ceph_osdc_put_request(osd_req);
	}
	return ret;

done_err:
	/* Release the bio reference taken above, then the request. */
	if (bio)
		bio_chain_put(osd_req->r_bio);
	ceph_osdc_put_request(osd_req);

	return ret;
}
1241
/* Completion callback that just drops the request (fire-and-forget). */
static void rbd_simple_req_cb(struct ceph_osd_request *osd_req,
			      struct ceph_msg *msg)
{
	ceph_osdc_put_request(osd_req);
}
1247
/*
 * Do a synchronous ceph osd operation
 *
 * For reads, up to "inbound_size" bytes of reply data are copied
 * into "inbound".  On success the (non-negative) return value from
 * the osd request is returned — it is used below as the byte count
 * for the copy; a negative errno is returned on failure.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   int flags,
			   struct ceph_osd_req_op *op,
			   const char *object_name,
			   u64 ofs, u64 inbound_size,
			   char *inbound,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(op != NULL);

	/* Reply data lands in this page vector before being copied out. */
	num_pages = calc_pages_for(ofs, inbound_size);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  object_name, ofs, inbound_size, NULL,
			  pages, num_pages,
			  flags,
			  op,
			  NULL,
			  ver);
	if (ret < 0)
		goto done;

	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1287
/* Hand an object request's prepared osd request to the osd client. */
static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
				struct rbd_obj_request *obj_request)
{
	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
}
1293
/*
 * Complete an image request: hand it to its callback if one is
 * registered, otherwise just drop the initial reference.
 */
static void rbd_img_request_complete(struct rbd_img_request *img_request)
{
	if (img_request->callback)
		img_request->callback(img_request);
	else
		rbd_img_request_put(img_request);
}
1301
/* Caller is responsible for rbd_obj_request_destroy(obj_request) */

/*
 * Wait (interruptibly) for an object request to complete.  Returns
 * 0 once rbd_obj_request_complete() fires, or -ERESTARTSYS if the
 * wait is interrupted by a signal.
 */
static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
	return wait_for_completion_interruptible(&obj_request->completion);
}
1308
/*
 * Mark an object request complete: invoke its callback if one is
 * registered, otherwise wake anyone in rbd_obj_request_wait().
 */
static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
{
	if (obj_request->callback)
		obj_request->callback(obj_request);
	else
		complete_all(&obj_request->completion);
}
1316
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001317/*
Alex Elder788e2df2013-01-17 12:25:27 -06001318 * Synchronously read a range from an object into a provided buffer
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001319 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001320static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001321 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001322 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001323 char *buf,
1324 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001325{
Alex Elder139b4312012-11-13 21:11:15 -06001326 struct ceph_osd_req_op *op;
Alex Elder913d2fd2012-06-26 12:57:03 -07001327 int ret;
1328
Alex Elder8d23bf22012-11-19 22:55:21 -06001329 op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, ofs, len);
Alex Elder139b4312012-11-13 21:11:15 -06001330 if (!op)
Alex Elder913d2fd2012-06-26 12:57:03 -07001331 return -ENOMEM;
1332
Alex Elder25704ac2012-11-09 08:43:16 -06001333 ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ,
Alex Elder8b84de72012-11-20 14:17:17 -06001334 op, object_name, ofs, len, buf, ver);
Alex Elder8d23bf22012-11-19 22:55:21 -06001335 rbd_osd_req_op_destroy(op);
Alex Elder913d2fd2012-06-26 12:57:03 -07001336
1337 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001338}
1339
1340/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001341 * Request sync osd watch
1342 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001343static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001344 u64 ver,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001345 u64 notify_id)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001346{
Alex Elder139b4312012-11-13 21:11:15 -06001347 struct ceph_osd_req_op *op;
Sage Weil11f77002011-05-12 16:13:54 -07001348 int ret;
1349
Alex Elder5efea492012-11-19 22:55:21 -06001350 op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
Alex Elder139b4312012-11-13 21:11:15 -06001351 if (!op)
Alex Elder57cfc102012-06-26 12:57:03 -07001352 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001353
Alex Elder0ce1a792012-07-03 16:01:18 -05001354 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001355 rbd_dev->header_name, 0, 0, NULL,
Alex Elderad4f2322012-07-03 16:01:19 -05001356 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001357 CEPH_OSD_FLAG_READ,
Alex Elder30573d62012-11-13 21:11:15 -06001358 op,
Alex Elder8b84de72012-11-20 14:17:17 -06001359 rbd_simple_req_cb, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001360
Alex Elder5efea492012-11-19 22:55:21 -06001361 rbd_osd_req_op_destroy(op);
1362
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001363 return ret;
1364}
1365
1366static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1367{
Alex Elder0ce1a792012-07-03 16:01:18 -05001368 struct rbd_device *rbd_dev = (struct rbd_device *)data;
Josh Durgina71b8912011-12-05 18:10:44 -08001369 u64 hver;
Sage Weil13143d22011-05-12 16:08:30 -07001370 int rc;
1371
Alex Elder0ce1a792012-07-03 16:01:18 -05001372 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001373 return;
1374
Alex Elderbd919d42012-07-13 20:35:11 -05001375 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1376 rbd_dev->header_name, (unsigned long long) notify_id,
1377 (unsigned int) opcode);
Alex Elder117973f2012-08-31 17:29:55 -05001378 rc = rbd_dev_refresh(rbd_dev, &hver);
Sage Weil13143d22011-05-12 16:08:30 -07001379 if (rc)
Alex Elder06ecc6c2012-11-01 10:17:15 -05001380 rbd_warn(rbd_dev, "got notification but failed to "
1381 " update snaps: %d\n", rc);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001382
Alex Elder7f0a24d2012-07-25 09:32:40 -05001383 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001384}
1385
1386/*
Alex Elder907703d2012-11-13 21:11:15 -06001387 * Request sync osd watch/unwatch. The value of "start" determines
1388 * whether a watch request is being initiated or torn down.
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001389 */
Alex Elder907703d2012-11-13 21:11:15 -06001390static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001391{
Alex Elder5efea492012-11-19 22:55:21 -06001392 struct ceph_osd_req_op *op;
1393 int ret = 0;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001394
Alex Elderc0430642013-01-18 12:31:09 -06001395 rbd_assert(start ^ !!rbd_dev->watch_event);
1396 rbd_assert(start ^ !!rbd_dev->watch_request);
1397
Alex Elder907703d2012-11-13 21:11:15 -06001398 if (start) {
1399 struct ceph_osd_client *osdc;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001400
Alex Elder907703d2012-11-13 21:11:15 -06001401 osdc = &rbd_dev->rbd_client->client->osdc;
1402 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev,
1403 &rbd_dev->watch_event);
1404 if (ret < 0)
Alex Elder5efea492012-11-19 22:55:21 -06001405 return ret;
Alex Elder907703d2012-11-13 21:11:15 -06001406 }
1407
Alex Elder5efea492012-11-19 22:55:21 -06001408 op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
1409 rbd_dev->watch_event->cookie,
1410 rbd_dev->header.obj_version, start);
1411 if (op)
1412 ret = rbd_req_sync_op(rbd_dev,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001413 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
Alex Elder907703d2012-11-13 21:11:15 -06001414 op, rbd_dev->header_name,
Alex Elder8b84de72012-11-20 14:17:17 -06001415 0, 0, NULL, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001416
Alex Elder5efea492012-11-19 22:55:21 -06001417 /* Cancel the event if we're tearing down, or on error */
1418
1419 if (!start || !op || ret < 0) {
Alex Elder907703d2012-11-13 21:11:15 -06001420 ceph_osdc_cancel_event(rbd_dev->watch_event);
1421 rbd_dev->watch_event = NULL;
1422 }
Alex Elder5efea492012-11-19 22:55:21 -06001423 rbd_osd_req_op_destroy(op);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001424
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001425 return ret;
1426}
1427
/*
 * Synchronous osd object method call ("class method" execution on
 * the osd).  Reply data, if any, is copied into "inbound" (up to
 * inbound_size bytes).  Returns a negative errno on failure.
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     u64 *ver)
{
	struct ceph_osd_req_op *op;
	int ret;

	/*
	 * Any input parameters required by the method we're calling
	 * will be sent along with the class and method names as
	 * part of the message payload.  That data and its size are
	 * supplied via the indata and indata_len fields (named from
	 * the perspective of the server side) in the OSD request
	 * operation.
	 */
	op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
					method_name, outbound, outbound_size);
	if (!op)
		return -ENOMEM;

	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op,
		       object_name, 0, inbound_size, inbound,
		       ver);

	rbd_osd_req_op_destroy(op);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1466
/*
 * Completion handler for a read op.  A nonexistent object reads as
 * zeros; a short read has its unfilled tail zeroed and is reported
 * as a full-length transfer.  Marks the request done.
 */
static void rbd_osd_read_callback(struct rbd_obj_request *obj_request,
				struct ceph_osd_op *op)
{
	u64 xferred;

	/*
	 * We support a 64-bit length, but ultimately it has to be
	 * passed to blk_end_request(), which takes an unsigned int.
	 */
	xferred = le64_to_cpu(op->extent.length);
	rbd_assert(xferred < (u64) UINT_MAX);
	if (obj_request->result == (s32) -ENOENT) {
		/* Object doesn't exist: treat as reading all zeros. */
		zero_bio_chain(obj_request->bio_list, 0);
		obj_request->result = 0;
	} else if (xferred < obj_request->length && !obj_request->result) {
		/* Short read: zero the remainder, report full length. */
		zero_bio_chain(obj_request->bio_list, xferred);
		xferred = obj_request->length;
	}
	obj_request->xferred = xferred;
	atomic_set(&obj_request->done, 1);
}
1488
/*
 * Completion handler for a write op: record the transferred length
 * and mark the request done.
 */
static void rbd_osd_write_callback(struct rbd_obj_request *obj_request,
				struct ceph_osd_op *op)
{
	obj_request->xferred = le64_to_cpu(op->extent.length);
	atomic_set(&obj_request->done, 1);
}
1495
/*
 * Main osd request completion callback.  Extracts the result and
 * version from the reply, dispatches to the per-opcode handler,
 * and completes the object request if the handler marked it done.
 */
static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
				struct ceph_msg *msg)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;
	struct ceph_osd_reply_head *reply_head;
	struct ceph_osd_op *op;
	u32 num_ops;
	u16 opcode;

	rbd_assert(osd_req == obj_request->osd_req);
	/* Either part of an image request (valid "which") or standalone. */
	rbd_assert(!!obj_request->img_request ^
				(obj_request->which == BAD_WHICH));

	obj_request->xferred = le32_to_cpu(msg->hdr.data_len);
	reply_head = msg->front.iov_base;
	obj_request->result = (s32) le32_to_cpu(reply_head->result);
	obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);

	num_ops = le32_to_cpu(reply_head->num_ops);
	WARN_ON(num_ops != 1);	/* For now */

	op = &reply_head->ops[0];
	opcode = le16_to_cpu(op->op);
	switch (opcode) {
	case CEPH_OSD_OP_READ:
		rbd_osd_read_callback(obj_request, op);
		break;
	case CEPH_OSD_OP_WRITE:
		rbd_osd_write_callback(obj_request, op);
		break;
	default:
		rbd_warn(NULL, "%s: unsupported op %hu\n",
			obj_request->object_name, (unsigned short) opcode);
		break;
	}

	/* The per-op handler sets "done" when the request is finished. */
	if (atomic_read(&obj_request->done))
		rbd_obj_request_complete(obj_request);
}
1535
/*
 * Allocate and build a single-op osd request for the given object
 * request.  For writes a snapshot context is taken from the image
 * request (the osd request gets its own reference to it); for reads
 * the mapped snapshot id is used instead.  Returns NULL on allocation
 * failure.
 */
static struct ceph_osd_request *rbd_osd_req_create(
					struct rbd_device *rbd_dev,
					bool write_request,
					struct rbd_obj_request *obj_request,
					struct ceph_osd_req_op *op)
{
	struct rbd_img_request *img_request = obj_request->img_request;
	struct ceph_snap_context *snapc = NULL;
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;
	struct timespec now;
	struct timespec *mtime;
	u64 snap_id = CEPH_NOSNAP;
	u64 offset = obj_request->offset;
	u64 length = obj_request->length;

	if (img_request) {
		rbd_assert(img_request->write_request == write_request);
		if (img_request->write_request)
			snapc = img_request->snapc;
		else
			snap_id = img_request->snap_id;
	}

	/* Allocate and initialize the request, for the single op */

	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
	if (!osd_req)
		return NULL;	/* ENOMEM */

	/* Attach the request's data, depending on how it is carried */
	rbd_assert(obj_request_type_valid(obj_request->type));
	switch (obj_request->type) {
	case OBJ_REQUEST_BIO:
		rbd_assert(obj_request->bio_list != NULL);
		osd_req->r_bio = obj_request->bio_list;
		bio_get(osd_req->r_bio);
		/* osd client requires "num pages" even for bio */
		osd_req->r_num_pages = calc_pages_for(offset, length);
		break;
	case OBJ_REQUEST_PAGES:
		osd_req->r_pages = obj_request->pages;
		osd_req->r_num_pages = obj_request->page_count;
		osd_req->r_page_alignment = offset & ~PAGE_MASK;
		break;
	}

	if (write_request) {
		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
		now = CURRENT_TIME;
		mtime = &now;
	} else {
		osd_req->r_flags = CEPH_OSD_FLAG_READ;
		mtime = NULL;	/* not needed for reads */
		offset = 0;	/* These are not used... */
		length = 0;	/* ...for osd read requests */
	}

	osd_req->r_callback = rbd_osd_req_callback;
	osd_req->r_priv = obj_request;

	osd_req->r_oid_len = strlen(obj_request->object_name);
	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);

	osd_req->r_file_layout = rbd_dev->layout;	/* struct */

	/* osd_req will get its own reference to snapc (if non-null) */

	ceph_osdc_build_request(osd_req, offset, length, 1, op,
				snapc, snap_id, mtime);

	return osd_req;
}
1610
/* Drop our reference to an osd request; it is freed when the last ref goes */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1615
1616/* object_name is assumed to be a non-null pointer and NUL-terminated */
1617
1618static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1619 u64 offset, u64 length,
1620 enum obj_request_type type)
1621{
1622 struct rbd_obj_request *obj_request;
1623 size_t size;
1624 char *name;
1625
1626 rbd_assert(obj_request_type_valid(type));
1627
1628 size = strlen(object_name) + 1;
1629 obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1630 if (!obj_request)
1631 return NULL;
1632
1633 name = (char *)(obj_request + 1);
1634 obj_request->object_name = memcpy(name, object_name, size);
1635 obj_request->offset = offset;
1636 obj_request->length = length;
1637 obj_request->which = BAD_WHICH;
1638 obj_request->type = type;
1639 INIT_LIST_HEAD(&obj_request->links);
1640 atomic_set(&obj_request->done, 0);
Alex Elder788e2df2013-01-17 12:25:27 -06001641 init_completion(&obj_request->completion);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001642 kref_init(&obj_request->kref);
1643
1644 return obj_request;
1645}
1646
1647static void rbd_obj_request_destroy(struct kref *kref)
1648{
1649 struct rbd_obj_request *obj_request;
1650
1651 obj_request = container_of(kref, struct rbd_obj_request, kref);
1652
1653 rbd_assert(obj_request->img_request == NULL);
1654 rbd_assert(obj_request->which == BAD_WHICH);
1655
1656 if (obj_request->osd_req)
1657 rbd_osd_req_destroy(obj_request->osd_req);
1658
1659 rbd_assert(obj_request_type_valid(obj_request->type));
1660 switch (obj_request->type) {
1661 case OBJ_REQUEST_BIO:
1662 if (obj_request->bio_list)
1663 bio_chain_put(obj_request->bio_list);
1664 break;
Alex Elder788e2df2013-01-17 12:25:27 -06001665 case OBJ_REQUEST_PAGES:
1666 if (obj_request->pages)
1667 ceph_release_page_vector(obj_request->pages,
1668 obj_request->page_count);
1669 break;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001670 }
1671
1672 kfree(obj_request);
1673}
1674
1675/*
1676 * Caller is responsible for filling in the list of object requests
1677 * that comprises the image request, and the Linux request pointer
1678 * (if there is one).
1679 */
/*
 * Allocate and initialize an image request for [offset, offset+length)
 * of the mapped image.  A write request captures the current snapshot
 * context under header_rwsem; a read request records the mapped
 * snapshot id instead.  Returns NULL on allocation failure.
 */
struct rbd_img_request *rbd_img_request_create(struct rbd_device *rbd_dev,
					u64 offset, u64 length,
					bool write_request)
{
	struct rbd_img_request *img_request;
	struct ceph_snap_context *snapc = NULL;

	img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
	if (!img_request)
		return NULL;

	if (write_request) {
		/* Snapshot context may be replaced by a refresh */
		down_read(&rbd_dev->header_rwsem);
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
		up_read(&rbd_dev->header_rwsem);
		if (WARN_ON(!snapc)) {
			kfree(img_request);
			return NULL;	/* Shouldn't happen */
		}
	}

	img_request->rq = NULL;
	img_request->rbd_dev = rbd_dev;
	img_request->offset = offset;
	img_request->length = length;
	img_request->write_request = write_request;
	/* snapc and snap_id presumably share storage; only one is set */
	if (write_request)
		img_request->snapc = snapc;
	else
		img_request->snap_id = rbd_dev->spec->snap_id;
	spin_lock_init(&img_request->completion_lock);
	img_request->next_completion = 0;
	img_request->callback = NULL;
	img_request->obj_request_count = 0;
	INIT_LIST_HEAD(&img_request->obj_requests);
	kref_init(&img_request->kref);

	rbd_img_request_get(img_request);	/* Avoid a warning */
	rbd_img_request_put(img_request);	/* TEMPORARY */

	return img_request;
}
1722
/*
 * Release callback invoked when the last reference to an image request
 * is dropped.  Detaches (and thereby drops the references to) all of
 * its object requests, releases the snapshot context held by a write
 * request, and frees the structure.
 */
static void rbd_img_request_destroy(struct kref *kref)
{
	struct rbd_img_request *img_request;
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	img_request = container_of(kref, struct rbd_img_request, kref);

	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_img_obj_request_del(img_request, obj_request);

	/* Only write requests hold a snapshot context */
	if (img_request->write_request)
		ceph_put_snap_context(img_request->snapc);

	kfree(img_request);
}
1739
/*
 * Split the image extent covered by @img_request into one object
 * request per rbd object it touches, cloning the corresponding slice
 * of @bio_list into each and attaching an osd request to it.  On
 * failure all partially-built object requests are torn down and
 * -ENOMEM is returned.
 */
static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
					struct bio *bio_list)
{
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	struct rbd_obj_request *obj_request = NULL;
	struct rbd_obj_request *next_obj_request;
	unsigned int bio_offset;
	u64 image_offset;
	u64 resid;
	u16 opcode;

	opcode = img_request->write_request ? CEPH_OSD_OP_WRITE
					      : CEPH_OSD_OP_READ;
	bio_offset = 0;
	image_offset = img_request->offset;
	rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT);
	resid = img_request->length;
	while (resid) {
		const char *object_name;
		unsigned int clone_size;
		struct ceph_osd_req_op *op;
		u64 offset;
		u64 length;

		/* Map the image offset to an object name/offset/length */
		object_name = rbd_segment_name(rbd_dev, image_offset);
		if (!object_name)
			goto out_unwind;
		offset = rbd_segment_offset(rbd_dev, image_offset);
		length = rbd_segment_length(rbd_dev, image_offset, resid);
		obj_request = rbd_obj_request_create(object_name,
						offset, length,
						OBJ_REQUEST_BIO);
		kfree(object_name);	/* object request has its own copy */
		if (!obj_request)
			goto out_unwind;

		rbd_assert(length <= (u64) UINT_MAX);
		clone_size = (unsigned int) length;
		obj_request->bio_list = bio_chain_clone_range(&bio_list,
						&bio_offset, clone_size,
						GFP_ATOMIC);
		if (!obj_request->bio_list)
			goto out_partial;

		/*
		 * Build up the op to use in building the osd
		 * request.  Note that the contents of the op are
		 * copied by rbd_osd_req_create().
		 */
		op = rbd_osd_req_op_create(opcode, offset, length);
		if (!op)
			goto out_partial;
		obj_request->osd_req = rbd_osd_req_create(rbd_dev,
						img_request->write_request,
						obj_request, op);
		rbd_osd_req_op_destroy(op);
		if (!obj_request->osd_req)
			goto out_partial;
		/* status and version are initially zero-filled */

		rbd_img_obj_request_add(img_request, obj_request);

		image_offset += length;
		resid -= length;
	}

	return 0;

out_partial:
	/* Drop the request that was being built when we failed... */
	rbd_obj_request_put(obj_request);
out_unwind:
	/* ...and everything already added to the image request */
	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_obj_request_put(obj_request);

	return -ENOMEM;
}
1816
/*
 * Per-object completion callback for requests that belong to an image
 * request.  Object requests may complete out of order, but the block
 * layer must be notified in order; under completion_lock we advance
 * next_completion over every consecutive request that is already done,
 * calling blk_end_request() for each.  The image request itself is
 * completed once blk_end_request() reports no more work remains.
 */
static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	u32 which = obj_request->which;
	bool more = true;

	img_request = obj_request->img_request;
	rbd_assert(img_request != NULL);
	rbd_assert(img_request->rq != NULL);
	rbd_assert(which != BAD_WHICH);
	rbd_assert(which < img_request->obj_request_count);
	rbd_assert(which >= img_request->next_completion);

	spin_lock_irq(&img_request->completion_lock);
	/* Not our turn yet; an earlier request will pick this one up */
	if (which != img_request->next_completion)
		goto out;

	for_each_obj_request_from(img_request, obj_request) {
		unsigned int xferred;
		int result;

		rbd_assert(more);
		rbd_assert(which < img_request->obj_request_count);

		if (!atomic_read(&obj_request->done))
			break;

		rbd_assert(obj_request->xferred <= (u64) UINT_MAX);
		xferred = (unsigned int) obj_request->xferred;
		result = (int) obj_request->result;
		if (result)
			rbd_warn(NULL, "obj_request %s result %d xferred %u\n",
				img_request->write_request ? "write" : "read",
				result, xferred);

		more = blk_end_request(img_request->rq, result, xferred);
		which++;
	}
	/* We're out of work exactly when every object request is done */
	rbd_assert(more ^ (which == img_request->obj_request_count));
	img_request->next_completion = which;
out:
	spin_unlock_irq(&img_request->completion_lock);

	if (!more)
		rbd_img_request_complete(img_request);
}
1863
/*
 * Submit every object request belonging to an image request to the
 * osd client.  Returns 0, or the first submission error.
 *
 * NOTE(review): on a mid-list failure the initial references of the
 * not-yet-submitted object requests are never dropped here — presumably
 * the caller's rbd_img_request_put() cleanup covers them; confirm.
 */
static int rbd_img_request_submit(struct rbd_img_request *img_request)
{
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;

	for_each_obj_request(img_request, obj_request) {
		int ret;

		obj_request->callback = rbd_img_obj_callback;
		ret = rbd_obj_request_submit(osdc, obj_request);
		if (ret)
			return ret;
		/*
		 * The image request has its own reference to each
		 * of its object requests, so we can safely drop the
		 * initial one here.
		 */
		rbd_obj_request_put(obj_request);
	}

	return 0;
}
1887
/*
 * Block-layer request function.  Called with q->queue_lock held; the
 * lock is dropped while each request is converted into an image
 * request and submitted, then reacquired before fetching the next one
 * (and before ending a failed request).
 */
static void rbd_request_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	bool read_only = rbd_dev->mapping.read_only;
	struct request *rq;
	int result;

	while ((rq = blk_fetch_request(q))) {
		bool write_request = rq_data_dir(rq) == WRITE;
		struct rbd_img_request *img_request;
		u64 offset;
		u64 length;

		/* Ignore any non-FS requests that filter through. */

		if (rq->cmd_type != REQ_TYPE_FS) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		spin_unlock_irq(q->queue_lock);

		/* Disallow writes to a read-only device */

		if (write_request) {
			result = -EROFS;
			if (read_only)
				goto end_request;
			/* Only the base image is ever writable */
			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
		}

		/* Quit early if the snapshot has disappeared */

		if (!atomic_read(&rbd_dev->exists)) {
			dout("request for non-existent snapshot");
			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
			result = -ENXIO;
			goto end_request;
		}

		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
		length = (u64) blk_rq_bytes(rq);

		/* Guard against offset + length wrapping past U64_MAX */
		result = -EINVAL;
		if (WARN_ON(offset && length > U64_MAX - offset + 1))
			goto end_request;	/* Shouldn't happen */

		result = -ENOMEM;
		img_request = rbd_img_request_create(rbd_dev, offset, length,
							write_request);
		if (!img_request)
			goto end_request;

		img_request->rq = rq;

		result = rbd_img_request_fill_bio(img_request, rq->bio);
		if (!result)
			result = rbd_img_request_submit(img_request);
		if (result)
			rbd_img_request_put(img_request);
end_request:
		spin_lock_irq(q->queue_lock);
		if (result < 0) {
			rbd_warn(rbd_dev, "obj_request %s result %d\n",
				write_request ? "write" : "read", result);
			__blk_end_request_all(rq, result);
		}
	}
}
1957
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001958/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001959 * a queue callback. Makes sure that we don't create a bio that spans across
1960 * multiple osd objects. One exception would be with a single page bios,
Alex Elderf7760da2012-10-20 22:17:27 -05001961 * which we handle later at bio_chain_clone_range()
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001962 */
/*
 * Returns how many bytes of @bvec may be merged into the bio described
 * by @bmd without the bio crossing an rbd object boundary.
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	sector_t sector_offset;
	sector_t sectors_per_obj;
	sector_t obj_sector_offset;
	int ret;

	/*
	 * Find how far into its rbd object the partition-relative
	 * bio start sector is to offset relative to the enclosing
	 * device.
	 */
	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
	/* Object size is a power of two, so masking yields the offset */
	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	obj_sector_offset = sector_offset & (sectors_per_obj - 1);

	/*
	 * Compute the number of bytes from that offset to the end
	 * of the object.  Account for what's already used by the bio.
	 */
	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
	if (ret > bmd->bi_size)
		ret -= bmd->bi_size;
	else
		ret = 0;

	/*
	 * Don't send back more than was asked for.  And if the bio
	 * was empty, let the whole thing through because:  "Note
	 * that a block device *must* allow a single page to be
	 * added to an empty bio."
	 */
	rbd_assert(bvec->bv_len <= PAGE_SIZE);
	if (ret > (int) bvec->bv_len || !bmd->bi_size)
		ret = (int) bvec->bv_len;

	return ret;
}
2003
/*
 * Tear down the gendisk for an rbd device: unregister it if it was
 * added, clean up its request queue, and drop the disk reference.
 * Safe to call when no disk was ever set up.
 */
static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk = rbd_dev->disk;

	if (!disk)
		return;

	if (disk->flags & GENHD_FL_UP)
		del_gendisk(disk);
	if (disk->queue)
		blk_cleanup_queue(disk->queue);
	put_disk(disk);
}
2017
Alex Elder788e2df2013-01-17 12:25:27 -06002018static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2019 const char *object_name,
2020 u64 offset, u64 length,
2021 char *buf, u64 *version)
2022
2023{
2024 struct ceph_osd_req_op *op;
2025 struct rbd_obj_request *obj_request;
2026 struct ceph_osd_client *osdc;
2027 struct page **pages = NULL;
2028 u32 page_count;
2029 int ret;
2030
2031 page_count = (u32) calc_pages_for(offset, length);
2032 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2033 if (IS_ERR(pages))
2034 ret = PTR_ERR(pages);
2035
2036 ret = -ENOMEM;
2037 obj_request = rbd_obj_request_create(object_name, offset, length,
2038 OBJ_REQUEST_PAGES);
2039 if (!obj_request)
2040 goto out;
2041
2042 obj_request->pages = pages;
2043 obj_request->page_count = page_count;
2044
2045 op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length);
2046 if (!op)
2047 goto out;
2048 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2049 obj_request, op);
2050 rbd_osd_req_op_destroy(op);
2051 if (!obj_request->osd_req)
2052 goto out;
2053
2054 osdc = &rbd_dev->rbd_client->client->osdc;
2055 ret = rbd_obj_request_submit(osdc, obj_request);
2056 if (ret)
2057 goto out;
2058 ret = rbd_obj_request_wait(obj_request);
2059 if (ret)
2060 goto out;
2061
2062 ret = obj_request->result;
2063 if (ret < 0)
2064 goto out;
2065 ret = ceph_copy_from_page_vector(pages, buf, 0, obj_request->xferred);
2066 if (version)
2067 *version = obj_request->version;
2068out:
2069 if (obj_request)
2070 rbd_obj_request_put(obj_request);
2071 else
2072 ceph_release_page_vector(pages, page_count);
2073
2074 return ret;
2075}
2076
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002077/*
Alex Elder4156d992012-08-02 11:29:46 -05002078 * Read the complete header for the given rbd device.
2079 *
2080 * Returns a pointer to a dynamically-allocated buffer containing
2081 * the complete and validated header. Caller can pass the address
2082 * of a variable that will be filled in with the version of the
2083 * header object at the time it was read.
2084 *
2085 * Returns a pointer-coded errno if a failure occurs.
2086 */
2087static struct rbd_image_header_ondisk *
2088rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
2089{
2090 struct rbd_image_header_ondisk *ondisk = NULL;
2091 u32 snap_count = 0;
2092 u64 names_size = 0;
2093 u32 want_count;
2094 int ret;
2095
2096 /*
2097 * The complete header will include an array of its 64-bit
2098 * snapshot ids, followed by the names of those snapshots as
2099 * a contiguous block of NUL-terminated strings. Note that
2100 * the number of snapshots could change by the time we read
2101 * it in, in which case we re-read it.
2102 */
2103 do {
2104 size_t size;
2105
2106 kfree(ondisk);
2107
2108 size = sizeof (*ondisk);
2109 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
2110 size += names_size;
2111 ondisk = kmalloc(size, GFP_KERNEL);
2112 if (!ondisk)
2113 return ERR_PTR(-ENOMEM);
2114
Alex Elder788e2df2013-01-17 12:25:27 -06002115 (void) rbd_req_sync_read; /* avoid a warning */
2116 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
Alex Elder4156d992012-08-02 11:29:46 -05002117 0, size,
2118 (char *) ondisk, version);
2119
2120 if (ret < 0)
2121 goto out_err;
2122 if (WARN_ON((size_t) ret < size)) {
2123 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05002124 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
2125 size, ret);
Alex Elder4156d992012-08-02 11:29:46 -05002126 goto out_err;
2127 }
2128 if (!rbd_dev_ondisk_valid(ondisk)) {
2129 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05002130 rbd_warn(rbd_dev, "invalid header");
Alex Elder4156d992012-08-02 11:29:46 -05002131 goto out_err;
2132 }
2133
2134 names_size = le64_to_cpu(ondisk->snap_names_len);
2135 want_count = snap_count;
2136 snap_count = le32_to_cpu(ondisk->snap_count);
2137 } while (snap_count != want_count);
2138
2139 return ondisk;
2140
2141out_err:
2142 kfree(ondisk);
2143
2144 return ERR_PTR(ret);
2145}
2146
/*
 * Re-read the on-disk header and convert it to the in-memory format.
 */
2150static int rbd_read_header(struct rbd_device *rbd_dev,
2151 struct rbd_image_header *header)
2152{
Alex Elder4156d992012-08-02 11:29:46 -05002153 struct rbd_image_header_ondisk *ondisk;
2154 u64 ver = 0;
2155 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002156
Alex Elder4156d992012-08-02 11:29:46 -05002157 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
2158 if (IS_ERR(ondisk))
2159 return PTR_ERR(ondisk);
2160 ret = rbd_header_from_disk(header, ondisk);
2161 if (ret >= 0)
2162 header->obj_version = ver;
2163 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002164
Alex Elder4156d992012-08-02 11:29:46 -05002165 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002166}
2167
Alex Elder41f38c22012-10-25 23:34:40 -05002168static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002169{
2170 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05002171 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002172
Alex Eldera0593292012-07-19 09:09:27 -05002173 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder41f38c22012-10-25 23:34:40 -05002174 rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002175}
2176
Alex Elder94785542012-10-09 13:50:17 -07002177static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
2178{
2179 sector_t size;
2180
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002181 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Alex Elder94785542012-10-09 13:50:17 -07002182 return;
2183
2184 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
2185 dout("setting size to %llu sectors", (unsigned long long) size);
2186 rbd_dev->mapping.size = (u64) size;
2187 set_capacity(rbd_dev->disk, size);
2188}
2189
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002190/*
2191 * only read the first part of the ondisk header, without the snaps info
2192 */
/*
 * Refresh a format 1 image's in-memory header from the osd, replacing
 * the snapshot metadata under header_rwsem.  If @hver is non-null it
 * receives the new header object version.
 */
static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	struct rbd_image_header h;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	down_write(&rbd_dev->header_rwsem);

	/* Update image size, and check for resize of mapped image */
	rbd_dev->header.image_size = h.image_size;
	rbd_update_mapping_size(rbd_dev);

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	if (hver)
		*hver = h.obj_version;
	rbd_dev->header.obj_version = h.obj_version;
	rbd_dev->header.image_size = h.image_size;
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	/* Reconcile the snapshot device list with the new context */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (!ret)
		ret = rbd_dev_snaps_register(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
2233
Alex Elder117973f2012-08-31 17:29:55 -05002234static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
Alex Elder1fe5e992012-07-25 09:32:41 -05002235{
2236 int ret;
2237
Alex Elder117973f2012-08-31 17:29:55 -05002238 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder1fe5e992012-07-25 09:32:41 -05002239 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05002240 if (rbd_dev->image_format == 1)
2241 ret = rbd_dev_v1_refresh(rbd_dev, hver);
2242 else
2243 ret = rbd_dev_v2_refresh(rbd_dev, hver);
Alex Elder1fe5e992012-07-25 09:32:41 -05002244 mutex_unlock(&ctl_mutex);
2245
2246 return ret;
2247}
2248
/*
 * Allocate and configure the gendisk and request queue for an rbd
 * device, sizing I/O limits to the rbd object size.  The disk is not
 * added to the system here.  Returns 0 or -ENOMEM.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* Keep bios from spanning rbd objects */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
2296
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002297/*
2298 sysfs
2299*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002300
/* Map a sysfs struct device back to its containing rbd_device */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
2305
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002306static ssize_t rbd_size_show(struct device *dev,
2307 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002308{
Alex Elder593a9e72012-02-07 12:03:37 -06002309 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08002310 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002311
Josh Durgina51aa0c2011-12-05 10:35:04 -08002312 down_read(&rbd_dev->header_rwsem);
2313 size = get_capacity(rbd_dev->disk);
2314 up_read(&rbd_dev->header_rwsem);
2315
2316 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002317}
2318
Alex Elder34b13182012-07-13 20:35:12 -05002319/*
2320 * Note this shows the features for whatever's mapped, which is not
2321 * necessarily the base image.
2322 */
/* sysfs: show the feature bits of whatever is mapped */
static ssize_t rbd_features_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "0x%016llx\n",
			(unsigned long long) rbd_dev->mapping.features);
}
2331
/* sysfs: show the block-device major number assigned to this device */
static ssize_t rbd_major_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%d\n", rbd_dev->major);
}
2339
/* sysfs: show the ceph client global id used for this device */
static ssize_t rbd_client_id_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "client%lld\n",
			ceph_client_id(rbd_dev->rbd_client->client));
}
2348
/* sysfs: show the name of the pool holding the image */
static ssize_t rbd_pool_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
}
2356
/* sysfs: show the id of the pool holding the image */
static ssize_t rbd_pool_id_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n",
		(unsigned long long) rbd_dev->spec->pool_id);
}
2365
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002366static ssize_t rbd_name_show(struct device *dev,
2367 struct device_attribute *attr, char *buf)
2368{
Alex Elder593a9e72012-02-07 12:03:37 -06002369 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002370
Alex Eldera92ffdf2012-10-30 19:40:33 -05002371 if (rbd_dev->spec->image_name)
2372 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2373
2374 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002375}
2376
/* sysfs: show the image's unique id */
static ssize_t rbd_image_id_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
}
2384
Alex Elder34b13182012-07-13 20:35:12 -05002385/*
2386 * Shows the name of the currently-mapped snapshot (or
2387 * RBD_SNAP_HEAD_NAME for the base image).
2388 */
/* sysfs: show the name of the mapped snapshot (RBD_SNAP_HEAD_NAME for base) */
static ssize_t rbd_snap_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
}
2397
/*
 * For an rbd v2 image, shows the pool id, image id, and snapshot id
 * for the parent image.  If there is no parent, simply shows
 * "(no parent image)".
 *
 * Output is built by appending one sprintf() at a time; the running
 * pointer bufp tracks the end of what has been written so far, and
 * the final return is the total number of bytes emitted.
 */
static ssize_t rbd_parent_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	struct rbd_spec *spec = rbd_dev->parent_spec;
	int count;
	char *bufp = buf;

	/* parent_spec is only set for layered (v2) images with a parent */
	if (!spec)
		return sprintf(buf, "(no parent image)\n");

	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
			(unsigned long long) spec->pool_id, spec->pool_name);
	if (count < 0)
		return count;
	bufp += count;

	/* image name may be absent; see rbd_dev_image_name() */
	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
			spec->image_name ? spec->image_name : "(unknown)");
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
			(unsigned long long) spec->snap_id, spec->snap_name);
	if (count < 0)
		return count;
	bufp += count;

	/* bytes of this image (from its start) backed by the parent */
	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
	if (count < 0)
		return count;
	bufp += count;

	return (ssize_t) (bufp - buf);
}
2440
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002441static ssize_t rbd_image_refresh(struct device *dev,
2442 struct device_attribute *attr,
2443 const char *buf,
2444 size_t size)
2445{
Alex Elder593a9e72012-02-07 12:03:37 -06002446 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05002447 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002448
Alex Elder117973f2012-08-31 17:29:55 -05002449 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05002450
2451 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002452}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002453
/* Per-device sysfs attributes; all read-only except "refresh" (write-only) */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002465
/* NULL-terminated array of attributes exposed for each mapped rbd device */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};
2480
static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

/* NULL-terminated list of groups attached via rbd_device_type below */
static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};
2489
/*
 * Release callback for the rbd device's struct device.  Intentionally
 * empty: the enclosing rbd_device's lifetime is managed elsewhere.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
2499
2500
2501/*
2502 sysfs - snapshots
2503*/
2504
2505static ssize_t rbd_snap_size_show(struct device *dev,
2506 struct device_attribute *attr,
2507 char *buf)
2508{
2509 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2510
Josh Durgin35915382011-12-05 18:25:13 -08002511 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002512}
2513
2514static ssize_t rbd_snap_id_show(struct device *dev,
2515 struct device_attribute *attr,
2516 char *buf)
2517{
2518 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2519
Josh Durgin35915382011-12-05 18:25:13 -08002520 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002521}
2522
Alex Elder34b13182012-07-13 20:35:12 -05002523static ssize_t rbd_snap_features_show(struct device *dev,
2524 struct device_attribute *attr,
2525 char *buf)
2526{
2527 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2528
2529 return sprintf(buf, "0x%016llx\n",
2530 (unsigned long long) snap->features);
2531}
2532
/* Read-only sysfs attributes for each snapshot device */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};
2547
/*
 * Device release callback for a snapshot: called when the last
 * reference to the embedded struct device is dropped.  Frees the
 * kstrdup()'d snapshot name, then the rbd_snap itself (in that
 * order -- the name is read through the snap).
 */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}
2554
/* NULL-terminated list of attribute groups for snapshot devices */
static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2564
Alex Elder8b8fb992012-10-26 17:25:24 -05002565static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2566{
2567 kref_get(&spec->kref);
2568
2569 return spec;
2570}
2571
static void rbd_spec_free(struct kref *kref);
/*
 * Drop a reference on @spec; rbd_spec_free() runs when the count
 * reaches zero.  Passing NULL is allowed and is a no-op.
 */
static void rbd_spec_put(struct rbd_spec *spec)
{
	if (spec)
		kref_put(&spec->kref, rbd_spec_free);
}
2578
2579static struct rbd_spec *rbd_spec_alloc(void)
2580{
2581 struct rbd_spec *spec;
2582
2583 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2584 if (!spec)
2585 return NULL;
2586 kref_init(&spec->kref);
2587
2588 rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */
2589
2590 return spec;
2591}
2592
/*
 * kref release callback for an rbd_spec: frees each of the name/id
 * strings the spec owns (kfree(NULL) is a no-op for any that were
 * never set), then the spec itself.
 */
static void rbd_spec_free(struct kref *kref)
{
	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);

	kfree(spec->pool_name);
	kfree(spec->image_id);
	kfree(spec->image_name);
	kfree(spec->snap_name);
	kfree(spec);
}
2603
/*
 * Allocate and initialize an rbd_device.  On success the new device
 * takes over the caller's references on @rbdc and @spec (both are
 * dropped in rbd_dev_destroy()); on allocation failure NULL is
 * returned and the caller retains ownership of both.
 */
struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
				struct rbd_spec *spec)
{
	struct rbd_device *rbd_dev;

	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		return NULL;

	spin_lock_init(&rbd_dev->lock);
	atomic_set(&rbd_dev->exists, 0);
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);
	init_rwsem(&rbd_dev->header_rwsem);

	rbd_dev->spec = spec;
	rbd_dev->rbd_client = rbdc;

	/* Initialize the layout used for all rbd requests */

	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);

	return rbd_dev;
}
2631
/*
 * Tear down an rbd_device created by rbd_dev_create(): drop the
 * parent spec reference (if any), free the header object name,
 * release the client and spec references, then free the device.
 */
static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	rbd_spec_put(rbd_dev->parent_spec);
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev);
}
2640
Alex Elder304f6802012-08-31 17:29:52 -05002641static bool rbd_snap_registered(struct rbd_snap *snap)
2642{
2643 bool ret = snap->dev.type == &rbd_snap_device_type;
2644 bool reg = device_is_registered(&snap->dev);
2645
2646 rbd_assert(!ret ^ reg);
2647
2648 return ret;
2649}
2650
/*
 * Unlink a snapshot from its device's snapshot list and, if its
 * sysfs device was registered, unregister it (which may drop the
 * final reference and free the snap via rbd_snap_dev_release()).
 */
static void rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	if (device_is_registered(&snap->dev))
		device_unregister(&snap->dev);
}
2657
/*
 * Register a snapshot's device under @parent in sysfs.  The device
 * type attaches the snap_* attribute group, and the device name is
 * the snapshot's name with RBD_SNAP_DEV_NAME_PREFIX prepended.
 * Returns the device_register() result (0 on success).
 */
static int rbd_register_snap_dev(struct rbd_snap *snap,
				  struct device *parent)
{
	struct device *dev = &snap->dev;
	int ret;

	dev->type = &rbd_snap_device_type;
	dev->parent = parent;
	dev->release = rbd_snap_dev_release;
	dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
	dout("%s: registering device for snapshot %s\n", __func__, snap->name);

	ret = device_register(dev);

	return ret;
}
2674
Alex Elder4e891e02012-07-10 20:30:10 -05002675static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05002676 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05002677 u64 snap_id, u64 snap_size,
2678 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002679{
Alex Elder4e891e02012-07-10 20:30:10 -05002680 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002681 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002682
2683 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002684 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002685 return ERR_PTR(-ENOMEM);
2686
2687 ret = -ENOMEM;
Alex Elderc8d18422012-07-10 20:30:11 -05002688 snap->name = kstrdup(snap_name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002689 if (!snap->name)
2690 goto err;
2691
Alex Elderc8d18422012-07-10 20:30:11 -05002692 snap->id = snap_id;
2693 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05002694 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05002695
2696 return snap;
2697
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002698err:
2699 kfree(snap->name);
2700 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002701
2702 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002703}
2704
Alex Eldercd892122012-07-03 16:01:19 -05002705static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2706 u64 *snap_size, u64 *snap_features)
2707{
2708 char *snap_name;
2709
2710 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2711
2712 *snap_size = rbd_dev->header.snap_sizes[which];
2713 *snap_features = 0; /* No features for v1 */
2714
2715 /* Skip over names until we find the one we are looking for */
2716
2717 snap_name = rbd_dev->header.snap_names;
2718 while (which--)
2719 snap_name += strlen(snap_name) + 1;
2720
2721 return snap_name;
2722}
2723
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.  Issues the "get_size" class method against the image's
 * header object; the reply is a packed { order, size } pair in
 * little-endian form.  Returns 0 on success, negative errno on
 * failure.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf), NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2756
/* Fetch order and size for the base image (snap id CEPH_NOSNAP). */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
2763
/*
 * Fetch a format 2 image's object name prefix via the
 * "get_object_prefix" class method and record it in the in-core
 * header (header.object_prefix, allocated string).  Returns 0 on
 * success, negative errno on failure.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	/* The reply is a length-prefixed string; copy it out */
	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
2800
/*
 * Fetch the feature bits for the given snapshot (or for the base
 * image when snap_id is CEPH_NOSNAP) via the "get_features" class
 * method.  If the image advertises incompatible features beyond
 * those this driver supports (RBD_FEATURES_ALL), the map is refused
 * with -ENXIO.  Returns 0 on success, negative errno otherwise.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	struct {
		__le64 features;
		__le64 incompat;
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Refuse the mapping if any unsupported incompatible bit is set */
	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_ALL)
		return -ENXIO;

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2834
/* Fetch feature bits for the base image into the in-core header. */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}
2840
/*
 * For a format 2 image, fetch the parent information (pool id,
 * image id, snapshot id, and overlap) via the "get_parent" class
 * method and attach it to rbd_dev as a new parent_spec.  An image
 * with no parent (pool id CEPH_NOPOOL in the reply) succeeds with
 * rbd_dev->parent_spec left unset.  Returns 0 on success, negative
 * errno otherwise.
 */
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	size_t size;
	void *reply_buf = NULL;
	__le64 snapid;
	void *p;
	void *end;
	char *image_id;
	u64 overlap;
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	/* Worst-case encoded reply size */
	size = sizeof (__le64) +			/* pool_id */
		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
		sizeof (__le64) +			/* snap_id */
		sizeof (__le64);			/* overlap */
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf) {
		ret = -ENOMEM;
		goto out_err;
	}

	snapid = cpu_to_le64(CEPH_NOSNAP);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_parent",
				(char *) &snapid, sizeof (snapid),
				(char *) reply_buf, size, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out_err;

	ret = -ERANGE;		/* default for any decode failure below */
	p = reply_buf;
	end = (char *) reply_buf + size;
	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
	if (parent_spec->pool_id == CEPH_NOPOOL)
		goto out;	/* No parent? No problem. */

	/* The ceph file layout needs to fit pool id in 32 bits */

	ret = -EIO;
	if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
		goto out;

	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(image_id)) {
		ret = PTR_ERR(image_id);
		goto out_err;
	}
	parent_spec->image_id = image_id;
	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
	ceph_decode_64_safe(&p, end, overlap, out_err);

	rbd_dev->parent_overlap = overlap;
	rbd_dev->parent_spec = parent_spec;
	parent_spec = NULL;	/* rbd_dev now owns this */
out:
	ret = 0;
out_err:
	kfree(reply_buf);
	rbd_spec_put(parent_spec);	/* no-op once ownership transferred */

	return ret;
}
2909
/*
 * Look up the user-visible image name for this device's image id by
 * asking the RBD_DIRECTORY object ("dir_get_name" class method).
 * Returns a kmalloc()'d name the caller must free, or NULL on any
 * failure -- callers treat the name as optional.
 */
static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	/* Only called while the name is still unknown */
	rbd_assert(!rbd_dev->spec->image_name);

	/* Build the length-prefixed encoding of the image id */
	len = strlen(rbd_dev->spec->image_id);
	image_id_size = sizeof (__le32) + len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = (char *) image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
				"rbd", "dir_get_name",
				image_id, image_id_size,
				(char *) reply_buf, size, NULL);
	if (ret < 0)
		goto out;
	p = reply_buf;
	end = (char *) reply_buf + size;
	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;	/* failure is tolerated */
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}
2958
2959/*
2960 * When a parent image gets probed, we only have the pool, image,
2961 * and snapshot ids but not the names of any of them. This call
2962 * is made later to fill in those names. It has to be done after
2963 * rbd_dev_snaps_update() has completed because some of the
2964 * information (in particular, snapshot name) is not available
2965 * until then.
2966 */
2967static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
2968{
2969 struct ceph_osd_client *osdc;
2970 const char *name;
2971 void *reply_buf = NULL;
2972 int ret;
2973
2974 if (rbd_dev->spec->pool_name)
2975 return 0; /* Already have the names */
2976
2977 /* Look up the pool name */
2978
2979 osdc = &rbd_dev->rbd_client->client->osdc;
2980 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05002981 if (!name) {
2982 rbd_warn(rbd_dev, "there is no pool with id %llu",
2983 rbd_dev->spec->pool_id); /* Really a BUG() */
2984 return -EIO;
2985 }
Alex Elder9e15b772012-10-30 19:40:33 -05002986
2987 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
2988 if (!rbd_dev->spec->pool_name)
2989 return -ENOMEM;
2990
2991 /* Fetch the image name; tolerate failure here */
2992
2993 name = rbd_dev_image_name(rbd_dev);
Alex Elder69e7a022012-11-01 08:39:26 -05002994 if (name)
Alex Elder9e15b772012-10-30 19:40:33 -05002995 rbd_dev->spec->image_name = (char *) name;
Alex Elder69e7a022012-11-01 08:39:26 -05002996 else
Alex Elder06ecc6c2012-11-01 10:17:15 -05002997 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05002998
2999 /* Look up the snapshot name. */
3000
3001 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
3002 if (!name) {
Alex Elder935dc892012-11-01 10:17:15 -05003003 rbd_warn(rbd_dev, "no snapshot with id %llu",
3004 rbd_dev->spec->snap_id); /* Really a BUG() */
Alex Elder9e15b772012-10-30 19:40:33 -05003005 ret = -EIO;
3006 goto out_err;
3007 }
3008 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
3009 if(!rbd_dev->spec->snap_name)
3010 goto out_err;
3011
3012 return 0;
3013out_err:
3014 kfree(reply_buf);
3015 kfree(rbd_dev->spec->pool_name);
3016 rbd_dev->spec->pool_name = NULL;
3017
3018 return ret;
3019}
3020
Alex Elder6e14b1a2012-07-03 16:01:19 -05003021static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05003022{
3023 size_t size;
3024 int ret;
3025 void *reply_buf;
3026 void *p;
3027 void *end;
3028 u64 seq;
3029 u32 snap_count;
3030 struct ceph_snap_context *snapc;
3031 u32 i;
3032
3033 /*
3034 * We'll need room for the seq value (maximum snapshot id),
3035 * snapshot count, and array of that many snapshot ids.
3036 * For now we have a fixed upper limit on the number we're
3037 * prepared to receive.
3038 */
3039 size = sizeof (__le64) + sizeof (__le32) +
3040 RBD_MAX_SNAP_COUNT * sizeof (__le64);
3041 reply_buf = kzalloc(size, GFP_KERNEL);
3042 if (!reply_buf)
3043 return -ENOMEM;
3044
3045 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
3046 "rbd", "get_snapcontext",
3047 NULL, 0,
Alex Elder07b23912012-11-09 08:43:16 -06003048 reply_buf, size, ver);
Alex Elder35d489f2012-07-03 16:01:19 -05003049 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
3050 if (ret < 0)
3051 goto out;
3052
3053 ret = -ERANGE;
3054 p = reply_buf;
3055 end = (char *) reply_buf + size;
3056 ceph_decode_64_safe(&p, end, seq, out);
3057 ceph_decode_32_safe(&p, end, snap_count, out);
3058
3059 /*
3060 * Make sure the reported number of snapshot ids wouldn't go
3061 * beyond the end of our buffer. But before checking that,
3062 * make sure the computed size of the snapshot context we
3063 * allocate is representable in a size_t.
3064 */
3065 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
3066 / sizeof (u64)) {
3067 ret = -EINVAL;
3068 goto out;
3069 }
3070 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3071 goto out;
3072
3073 size = sizeof (struct ceph_snap_context) +
3074 snap_count * sizeof (snapc->snaps[0]);
3075 snapc = kmalloc(size, GFP_KERNEL);
3076 if (!snapc) {
3077 ret = -ENOMEM;
3078 goto out;
3079 }
3080
3081 atomic_set(&snapc->nref, 1);
3082 snapc->seq = seq;
3083 snapc->num_snaps = snap_count;
3084 for (i = 0; i < snap_count; i++)
3085 snapc->snaps[i] = ceph_decode_64(&p);
3086
3087 rbd_dev->header.snapc = snapc;
3088
3089 dout(" snap context seq = %llu, snap_count = %u\n",
3090 (unsigned long long) seq, (unsigned int) snap_count);
3091
3092out:
3093 kfree(reply_buf);
3094
3095 return 0;
3096}
3097
/*
 * Fetch the name of the snapshot at position @which in the header's
 * snapshot context, via the "get_snapshot_name" class method.
 * Returns a newly-allocated name string the caller must free, or an
 * ERR_PTR() on failure.
 */
static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	size_t size;
	void *reply_buf;
	__le64 snap_id;
	int ret;
	void *p;
	void *end;
	char *snap_name;

	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_snapshot_name",
				(char *) &snap_id, sizeof (snap_id),
				reply_buf, size, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	end = (char *) reply_buf + size;
	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out;
	} else {
		dout(" snap_id 0x%016llx snap_name = %s\n",
			(unsigned long long) le64_to_cpu(snap_id), snap_name);
	}
	kfree(reply_buf);

	return snap_name;
out:
	kfree(reply_buf);

	return ERR_PTR(ret);
}
3140
3141static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3142 u64 *snap_size, u64 *snap_features)
3143{
Alex Eldere0b49862013-01-09 14:44:18 -06003144 u64 snap_id;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003145 u8 order;
3146 int ret;
3147
3148 snap_id = rbd_dev->header.snapc->snaps[which];
3149 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
3150 if (ret)
3151 return ERR_PTR(ret);
3152 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
3153 if (ret)
3154 return ERR_PTR(ret);
3155
3156 return rbd_dev_v2_snap_name(rbd_dev, which);
3157}
3158
3159static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3160 u64 *snap_size, u64 *snap_features)
3161{
3162 if (rbd_dev->image_format == 1)
3163 return rbd_dev_v1_snap_info(rbd_dev, which,
3164 snap_size, snap_features);
3165 if (rbd_dev->image_format == 2)
3166 return rbd_dev_v2_snap_info(rbd_dev, which,
3167 snap_size, snap_features);
3168 return ERR_PTR(-EINVAL);
3169}
3170
Alex Elder117973f2012-08-31 17:29:55 -05003171static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3172{
3173 int ret;
3174 __u8 obj_order;
3175
3176 down_write(&rbd_dev->header_rwsem);
3177
3178 /* Grab old order first, to see if it changes */
3179
3180 obj_order = rbd_dev->header.obj_order,
3181 ret = rbd_dev_v2_image_size(rbd_dev);
3182 if (ret)
3183 goto out;
3184 if (rbd_dev->header.obj_order != obj_order) {
3185 ret = -EIO;
3186 goto out;
3187 }
3188 rbd_update_mapping_size(rbd_dev);
3189
3190 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
3191 dout("rbd_dev_v2_snap_context returned %d\n", ret);
3192 if (ret)
3193 goto out;
3194 ret = rbd_dev_snaps_update(rbd_dev);
3195 dout("rbd_dev_snaps_update returned %d\n", ret);
3196 if (ret)
3197 goto out;
3198 ret = rbd_dev_snaps_register(rbd_dev);
3199 dout("rbd_dev_snaps_register returned %d\n", ret);
3200out:
3201 up_write(&rbd_dev->header_rwsem);
3202
3203 return ret;
3204}
3205
/*
 * Scan the rbd device's current snapshot list and compare it to the
 * newly-received snapshot context.  Remove any existing snapshots
 * not present in the new snapshot context.  Add a new snapshot for
 * any snaphots in the snapshot context not in the current list.
 * And verify there are no changes to snapshots we already know
 * about.
 *
 * Assumes the snapshots in the snapshot context are sorted by
 * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
 * are also maintained in that order.)
 *
 * Implementation: a two-cursor merge -- "index" walks the new
 * context's id array while "links" walks the existing list; each
 * iteration removes, inserts, or matches exactly one snapshot.
 */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const u32 snap_count = snapc->num_snaps;
	struct list_head *head = &rbd_dev->snaps;
	struct list_head *links = head->next;
	u32 index = 0;

	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
	while (index < snap_count || links != head) {
		u64 snap_id;
		struct rbd_snap *snap;
		char *snap_name;
		u64 snap_size = 0;
		u64 snap_features = 0;

		/* Current entry from each side, or a sentinel when done */
		snap_id = index < snap_count ? snapc->snaps[index]
					     : CEPH_NOSNAP;
		snap = links != head ? list_entry(links, struct rbd_snap, node)
				     : NULL;
		rbd_assert(!snap || snap->id != CEPH_NOSNAP);

		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
			struct list_head *next = links->next;

			/* Existing snapshot not in the new snap context */

			if (rbd_dev->spec->snap_id == snap->id)
				atomic_set(&rbd_dev->exists, 0);
			rbd_remove_snap_dev(snap);
			/*
			 * NOTE(review): rbd_remove_snap_dev() may
			 * unregister the device, which can release and
			 * free snap before the snap->id reads below --
			 * looks like a potential use-after-free; confirm
			 * against the device refcounting.
			 */
			dout("%ssnap id %llu has been removed\n",
				rbd_dev->spec->snap_id == snap->id ?
							"mapped " : "",
				(unsigned long long) snap->id);

			/* Done with this list entry; advance */

			links = next;
			continue;
		}

		snap_name = rbd_dev_snap_info(rbd_dev, index,
					&snap_size, &snap_features);
		if (IS_ERR(snap_name))
			return PTR_ERR(snap_name);

		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
			(unsigned long long) snap_id);
		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
			struct rbd_snap *new_snap;

			/* We haven't seen this snapshot before */

			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
					snap_id, snap_size, snap_features);
			if (IS_ERR(new_snap)) {
				int err = PTR_ERR(new_snap);

				dout(" failed to add dev, error %d\n", err);

				return err;
			}

			/* New goes before existing, or at end of list */

			dout(" added dev%s\n", snap ? "" : " at end\n");
			if (snap)
				list_add_tail(&new_snap->node, &snap->node);
			else
				list_add_tail(&new_snap->node, head);
		} else {
			/* Already have this one */

			dout(" already present\n");

			/* A known snapshot must be unchanged */
			rbd_assert(snap->size == snap_size);
			rbd_assert(!strcmp(snap->name, snap_name));
			rbd_assert(snap->features == snap_features);

			/* Done with this list entry; advance */

			links = links->next;
		}

		/* Advance to the next entry in the snapshot context */

		index++;
	}
	dout("%s: done\n", __func__);

	return 0;
}
3310
Alex Elder304f6802012-08-31 17:29:52 -05003311/*
3312 * Scan the list of snapshots and register the devices for any that
3313 * have not already been registered.
3314 */
3315static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
3316{
3317 struct rbd_snap *snap;
3318 int ret = 0;
3319
3320 dout("%s called\n", __func__);
Alex Elder86ff77b2012-08-31 17:29:53 -05003321 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
3322 return -EIO;
Alex Elder304f6802012-08-31 17:29:52 -05003323
3324 list_for_each_entry(snap, &rbd_dev->snaps, node) {
3325 if (!rbd_snap_registered(snap)) {
3326 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
3327 if (ret < 0)
3328 break;
3329 }
3330 }
3331 dout("%s: returning %d\n", __func__, ret);
3332
3333 return ret;
3334}
3335
/*
 * Initialize the embedded struct device for an rbd device and
 * register it with the driver core, making it visible in sysfs
 * under the rbd bus.  Returns 0 or a negative errno from
 * device_register().
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	struct device *dev;
	int ret;

	/*
	 * mutex_lock_nested() tells lockdep this acquisition of
	 * ctl_mutex may nest within another acquisition of it.
	 * NOTE(review): the outer holder is presumably a sysfs
	 * store method -- confirm.
	 */
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	dev = &rbd_dev->dev;
	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;	/* all rbd devices hang off one root */
	dev->release = rbd_dev_release;	/* driver-core release callback */
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);

	mutex_unlock(&ctl_mutex);

	return ret;
}
3355
/*
 * Undo rbd_bus_add_dev().  Final teardown of the rbd_device happens
 * via the device's release callback (rbd_dev_release, installed in
 * rbd_bus_add_dev()) once the last reference to the device is gone.
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
3360
/* Largest device id handed out so far; see rbd_dev_id_get()/rbd_dev_id_put() */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06003362
3363/*
Alex Elder499afd52012-02-02 08:13:29 -06003364 * Get a unique rbd identifier for the given new rbd_dev, and add
3365 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06003366 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* atomic64_inc_return() makes the first id handed out 1, not 0 */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	/* rbd_dev_list is protected by rbd_dev_list_lock */
	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06003377
Alex Elder1ddbe942012-01-29 13:57:44 -06003378/*
Alex Elder499afd52012-02-02 08:13:29 -06003379 * Remove an rbd_dev from the global list, and record that its
3380 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06003381 */
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
{
	struct list_head *tmp;
	int rbd_id = rbd_dev->dev_id;
	int max_id;

	rbd_assert(rbd_id > 0);	/* ids start at 1; see rbd_dev_id_get() */

	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);

	/*
	 * If the id being "put" is not the current maximum, there
	 * is nothing special we need to do.
	 */
	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
		spin_unlock(&rbd_dev_list_lock);
		return;
	}

	/*
	 * We need to update the current maximum id.  Search the
	 * list to find out what it is.  We're more likely to find
	 * the maximum at the end, so search the list backward.
	 */
	max_id = 0;
	list_for_each_prev(tmp, &rbd_dev_list) {
		struct rbd_device *rbd_dev;	/* NOTE: shadows the parameter */

		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id > max_id)
			max_id = rbd_dev->dev_id;
	}
	spin_unlock(&rbd_dev_list_lock);

	/*
	 * The max id could have been updated by rbd_dev_id_get(), in
	 * which case it now accurately reflects the new maximum.
	 * Be careful not to overwrite the maximum value in that
	 * case.  (The cmpxchg only installs max_id if the maximum
	 * is still the id we just released.)
	 */
	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
	dout(" max dev id has been reset\n");
}
3428
Alex Eldera725f65e2012-02-02 08:13:30 -06003429/*
Alex Eldere28fff262012-02-02 08:13:30 -06003430 * Skips over white space at *buf, and updates *buf to point to the
3431 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06003432 * the token (string of non-white space characters) found. Note
3433 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06003434 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The delimiter set below is exactly what produces nonzero
	 * for isspace() in the "C" and "POSIX" locales.
	 */
	const char *delims = " \f\n\r\t\v";
	const char *p = *buf;

	p += strspn(p, delims);		/* advance past leading white space */
	*buf = p;

	return strcspn(p, delims);	/* token length (0 if none found) */
}
3447
3448/*
3449 * Finds the next token in *buf, and if the provided token buffer is
3450 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06003451 * copied, is guaranteed to be terminated with '\0'. Note that *buf
3452 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06003453 *
3454 * Returns the length of the token found (not including the '\0').
3455 * Return value will be 0 if no token is found, and it will be >=
3456 * token_size if the token would not fit.
3457 *
Alex Elder593a9e72012-02-07 12:03:37 -06003458 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06003459 * found token. Note that this occurs even if the token buffer is
3460 * too small to hold it.
3461 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	/* Same white-space set isspace() matches in the "C" locale */
	const char *spaces = " \f\n\r\t\v";
	size_t len;

	*buf += strspn(*buf, spaces);	/* skip to start of token */
	len = strcspn(*buf, spaces);	/* measure the token */

	/* Copy only if it fits; a too-small buffer is left untouched */
	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;	/* consume the token even if it was not copied */

	return len;
}
3477
3478/*
Alex Elderea3352f2012-07-09 21:04:23 -05003479 * Finds the next token in *buf, dynamically allocates a buffer big
3480 * enough to hold a copy of it, and copies the token into the new
3481 * buffer. The copy is guaranteed to be terminated with '\0'. Note
3482 * that a duplicate buffer is created even for a zero-length token.
3483 *
3484 * Returns a pointer to the newly-allocated duplicate, or a null
3485 * pointer if memory for the duplicate was not available. If
3486 * the lenp argument is a non-null pointer, the length of the token
3487 * (not including the '\0') is returned in *lenp.
3488 *
3489 * If successful, the *buf pointer will be updated to point beyond
3490 * the end of the found token.
3491 *
3492 * Note: uses GFP_KERNEL for allocation.
3493 */
3494static inline char *dup_token(const char **buf, size_t *lenp)
3495{
3496 char *dup;
3497 size_t len;
3498
3499 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05003500 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05003501 if (!dup)
3502 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05003503 *(dup + len) = '\0';
3504 *buf += len;
3505
3506 if (lenp)
3507 *lenp = len;
3508
3509 return dup;
3510}
3511
3512/*
Alex Elder859c31d2012-10-25 23:34:42 -05003513 * Parse the options provided for an "rbd add" (i.e., rbd image
3514 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
3515 * and the data written is passed here via a NUL-terminated buffer.
3516 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05003517 *
Alex Elder859c31d2012-10-25 23:34:42 -05003518 * The information extracted from these options is recorded in
3519 * the other parameters which return dynamically-allocated
3520 * structures:
3521 * ceph_opts
3522 * The address of a pointer that will refer to a ceph options
3523 * structure. Caller must release the returned pointer using
3524 * ceph_destroy_options() when it is no longer needed.
3525 * rbd_opts
3526 * Address of an rbd options pointer. Fully initialized by
3527 * this function; caller must release with kfree().
3528 * spec
3529 * Address of an rbd image specification pointer. Fully
3530 * initialized by this function based on parsed options.
3531 * Caller must release with rbd_spec_put().
3532 *
3533 * The options passed take this form:
3534 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3535 * where:
3536 * <mon_addrs>
3537 * A comma-separated list of one or more monitor addresses.
3538 * A monitor address is an ip address, optionally followed
3539 * by a port number (separated by a colon).
3540 * I.e.: ip1[:port1][,ip2[:port2]...]
3541 * <options>
3542 * A comma-separated list of ceph and/or rbd options.
3543 * <pool_name>
3544 * The name of the rados pool containing the rbd image.
3545 * <image_name>
3546 * The name of the image in that pool to map.
3547 * <snap_id>
3548 * An optional snapshot id. If provided, the mapping will
3549 * present data from the image at the time that snapshot was
3550 * created. The image head is used if no snapshot id is
3551 * provided. Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06003552 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	size_t mon_addrs_size;
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	/*
	 * Don't copy the monitor list; just remember where it starts
	 * and its extent.  ceph_parse_options() below is handed the
	 * range [mon_addrs, mon_addrs + mon_addrs_size - 1).
	 */
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	ret = -EINVAL;	/* error used by the "empty token" checks below */
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	spec->image_name = dup_token(&buf, NULL);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	/* Copies len + 1 bytes; byte at len may be a delimiter, so force NUL */
	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!spec->snap_name)
		goto out_mem;
	*(spec->snap_name + len) = '\0';

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);	/* only needed during parsing */

	/* Success: ownership of all three objects passes to the caller */

	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	/* kfree() and rbd_spec_put() both tolerate NULL pointers */
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}
3655
Alex Elder589d30e2012-07-10 20:30:11 -05003656/*
3657 * An rbd format 2 image has a unique identifier, distinct from the
3658 * name given to it by the user. Internally, that identifier is
3659 * what's used to specify the names of objects related to the image.
3660 *
3661 * A special "rbd id" object is used to map an rbd image name to its
3662 * id. If that object doesn't exist, then there is no v2 rbd image
3663 * with the supplied name.
3664 *
3665 * This function will record the given rbd_dev's image_id field if
3666 * it can be determined, and in that case will return 0. If any
3667 * errors occur a negative errno will be returned and the rbd_dev's
3668 * image_id field will be unchanged (and should be NULL).
3669 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.
	 */
	if (rbd_dev->spec->image_id)
		return 0;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.  The id object
	 * is named RBD_ID_PREFIX followed by the image name.
	 */
	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* Invoke the "get_id" method of the "rbd" object class */

	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	p = response;
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						NULL, GFP_NOIO);
	if (IS_ERR(rbd_dev->spec->image_id)) {
		ret = PTR_ERR(rbd_dev->spec->image_id);
		rbd_dev->spec->image_id = NULL;	/* don't leave an ERR_PTR behind */
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
3731
/*
 * Probe an image assuming it is an original (format 1) rbd image:
 * record an empty image id, derive the header object name from the
 * image name, and read the on-disk header.  On failure everything
 * allocated here is released and the spec's image_id is reset.
 */
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->spec->image_id)
		return -ENOMEM;

	/* Record the header object name for this rbd image. */

	size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	sprintf(rbd_dev->header_name, "%s%s",
		rbd_dev->spec->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;

	/* Version 1 images have no parent (no layering) */

	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;

	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	/* kfree(NULL) is a no-op, so partial setup is handled uniformly */
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;

	return ret;
}
3780
/*
 * Probe a format 2 image: build the header object name from the
 * (already-known) image id, then fetch size/order, object prefix,
 * features, optional parent (layering) info, and the snapshot
 * context.  On any failure all state set up here is torn down.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	/* Undo everything above; rbd_spec_put()/kfree() tolerate NULL */
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3853
/*
 * Complete a successful probe: update snapshots and the mapping,
 * allocate a device id, register the block device and its sysfs
 * entries, start watching the header object, and finally announce
 * the disk.  Each error label undoes the steps taken before it, in
 * reverse order; once the device is registered in sysfs, cleanup
 * is instead delegated to the release path via rbd_bus_del_dev().
 */
static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_probe_update_spec(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	ret = register_blkdev(0, rbd_dev->name);	/* 0 = dynamic major */
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	/* Start watching the header object for changes (1 = start) */
	ret = rbd_req_sync_watch(rbd_dev, 1);
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;
err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}
3935
Alex Eldera30b71b2012-07-10 20:30:11 -05003936/*
3937 * Probe for the existence of the header object for the given rbd
3938 * device. For format 2 images this includes determining the image
3939 * id.
3940 */
3941static int rbd_dev_probe(struct rbd_device *rbd_dev)
3942{
3943 int ret;
3944
3945 /*
3946 * Get the id from the image id object. If it's not a
3947 * format 2 image, we'll get ENOENT back, and we'll assume
3948 * it's a format 1 image.
3949 */
3950 ret = rbd_dev_image_id(rbd_dev);
3951 if (ret)
3952 ret = rbd_dev_v1_probe(rbd_dev);
3953 else
3954 ret = rbd_dev_v2_probe(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05003955 if (ret) {
Alex Eldera30b71b2012-07-10 20:30:11 -05003956 dout("probe failed, returning %d\n", ret);
3957
Alex Elder83a06262012-10-30 15:47:17 -05003958 return ret;
3959 }
3960
3961 ret = rbd_dev_probe_finish(rbd_dev);
3962 if (ret)
3963 rbd_header_free(&rbd_dev->header);
3964
Alex Eldera30b71b2012-07-10 20:30:11 -05003965 return ret;
3966}
3967
/*
 * sysfs "add" store method: parse the user-supplied mapping spec,
 * set up a ceph client, map the pool name to its id, create the
 * rbd_device, and probe it.  Ownership of ceph_opts, rbdc and spec
 * is handed off step by step (each local pointer is set to NULL once
 * another object owns it), so the error labels free only what is
 * still owned here.  Returns count on success or a negative errno.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto err_out_module;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}
	ceph_opts = NULL;	/* rbd_dev client now owns this */

	/* pick the pool */
	osdc = &rbdc->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
	if (rc < 0)
		goto err_out_client;
	spec->pool_id = (u64) rc;

	/* The ceph file layout needs to fit pool id in 32 bits */

	if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
		rc = -EIO;
		goto err_out_client;
	}

	rbd_dev = rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		goto err_out_client;	/* rc is still -ENOMEM here */
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */

	rbd_dev->mapping.read_only = rbd_opts->read_only;
	kfree(rbd_opts);
	rbd_opts = NULL;	/* done with this */

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_rbd_dev;

	return count;
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_opts);
	rbd_spec_put(spec);
err_out_module:
	module_put(THIS_MODULE);

	dout("Error adding device %s\n", buf);

	return (ssize_t) rc;
}
4040
Alex Elderde71a292012-07-03 16:01:19 -05004041static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004042{
4043 struct list_head *tmp;
4044 struct rbd_device *rbd_dev;
4045
Alex Eldere124a82f2012-01-29 13:57:44 -06004046 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004047 list_for_each(tmp, &rbd_dev_list) {
4048 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05004049 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a82f2012-01-29 13:57:44 -06004050 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004051 return rbd_dev;
Alex Eldere124a82f2012-01-29 13:57:44 -06004052 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004053 }
Alex Eldere124a82f2012-01-29 13:57:44 -06004054 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004055 return NULL;
4056}
4057
/*
 * Release callback for an rbd device, invoked by the driver core
 * once the device's last reference is dropped (after the device has
 * been removed from the bus).  Tears down, in order: the lingering
 * watch request, the OSD-side watch, the disk and block device, the
 * image header, and finally the device id and rbd_device itself.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop re-arming the watch before tearing anything else down */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	/* NOTE(review): the 0 argument presumably requests un-watch —
	 * confirm against rbd_req_sync_watch() */
	if (rbd_dev->watch_event)
		rbd_req_sync_watch(rbd_dev, 0);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);

	/* release module ref taken when the device was added */
	module_put(THIS_MODULE);
}
4086
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004087static ssize_t rbd_remove(struct bus_type *bus,
4088 const char *buf,
4089 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004090{
4091 struct rbd_device *rbd_dev = NULL;
4092 int target_id, rc;
4093 unsigned long ul;
4094 int ret = count;
4095
4096 rc = strict_strtoul(buf, 10, &ul);
4097 if (rc)
4098 return rc;
4099
4100 /* convert to int; abort if we lost anything in the conversion */
4101 target_id = (int) ul;
4102 if (target_id != ul)
4103 return -EINVAL;
4104
4105 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4106
4107 rbd_dev = __rbd_get_dev(target_id);
4108 if (!rbd_dev) {
4109 ret = -ENOENT;
4110 goto done;
4111 }
4112
Alex Elder42382b72012-11-16 09:29:16 -06004113 if (rbd_dev->open_count) {
4114 ret = -EBUSY;
4115 goto done;
4116 }
4117
Alex Elder41f38c22012-10-25 23:34:40 -05004118 rbd_remove_all_snaps(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004119 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004120
4121done:
4122 mutex_unlock(&ctl_mutex);
Alex Elderaafb2302012-09-06 16:00:54 -05004123
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004124 return ret;
4125}
4126
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004127/*
4128 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004129 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004130 */
4131static int rbd_sysfs_init(void)
4132{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004133 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004134
Alex Elderfed4c142012-02-07 12:03:36 -06004135 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06004136 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004137 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004138
Alex Elderfed4c142012-02-07 12:03:36 -06004139 ret = bus_register(&rbd_bus_type);
4140 if (ret < 0)
4141 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004142
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004143 return ret;
4144}
4145
/*
 * Remove the driver's sysfs entries; undoes rbd_sysfs_init() in
 * reverse order (bus first, then the root device it hangs off).
 */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
4151
4152int __init rbd_init(void)
4153{
4154 int rc;
4155
4156 rc = rbd_sysfs_init();
4157 if (rc)
4158 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06004159 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004160 return 0;
4161}
4162
/*
 * Module teardown: remove the sysfs control files.  Individual
 * mapped devices hold module references, so this only runs once
 * they are all gone.
 */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
4167
/* Module entry/exit points and module metadata */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");