/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		 Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/blkdev.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

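/*
 * Snapshots are exposed through sysfs as devices named "snap_<name>",
 * so a snapshot name must leave room for that prefix within a
 * NAME_MAX-sized buffer.
 */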
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */

#define RBD_FEATURE_LAYERING	(1<<0)
#define RBD_FEATURE_STRIPINGV2	(1<<1)
#define RBD_FEATURES_ALL \
	    (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(0)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
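/* e.g. with a 4-byte int: (5 * 4) / 2 + 1 = 11, the width of "-2147483648" */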

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These fields never change for a given rbd image */
	char *object_prefix;
	u64 features;
	__u8 obj_order;
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;
	u64 *snap_sizes;

	u64 obj_version;
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	char		*pool_name;

	char		*image_id;
	char		*image_name;

	u64		snap_id;
	char		*snap_name;

	struct kref	kref;
};

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};
160
Alex Elderbf0d5f502012-11-22 00:00:08 -0600161struct rbd_img_request;
162typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
163
164#define BAD_WHICH U32_MAX /* Good which or bad which, which? */
165
166struct rbd_obj_request;
167typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
168
Alex Elder9969ebc2013-01-18 12:31:10 -0600169enum obj_request_type {
170 OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
171};
Alex Elderbf0d5f502012-11-22 00:00:08 -0600172
Alex Elder926f9b32013-02-11 12:33:24 -0600173enum obj_req_flags {
174 OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */
Alex Elder6365d332013-02-11 12:33:24 -0600175 OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */
Alex Elder5679c592013-02-11 12:33:24 -0600176 OBJ_REQ_KNOWN, /* EXISTS flag valid: no = 0, yes = 1 */
177 OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */
Alex Elder926f9b32013-02-11 12:33:24 -0600178};
179
Alex Elderbf0d5f502012-11-22 00:00:08 -0600180struct rbd_obj_request {
181 const char *object_name;
182 u64 offset; /* object start byte */
183 u64 length; /* bytes from offset */
Alex Elder926f9b32013-02-11 12:33:24 -0600184 unsigned long flags;
Alex Elderbf0d5f502012-11-22 00:00:08 -0600185
Alex Elderc5b5ef62013-02-11 12:33:24 -0600186 /*
187 * An object request associated with an image will have its
188 * img_data flag set; a standalone object request will not.
189 *
190 * A standalone object request will have which == BAD_WHICH
191 * and a null obj_request pointer.
192 *
193 * An object request initiated in support of a layered image
194 * object (to check for its existence before a write) will
195 * have which == BAD_WHICH and a non-null obj_request pointer.
196 *
197 * Finally, an object request for rbd image data will have
198 * which != BAD_WHICH, and will have a non-null img_request
199 * pointer. The value of which will be in the range
200 * 0..(img_request->obj_request_count-1).
201 */
202 union {
203 struct rbd_obj_request *obj_request; /* STAT op */
204 struct {
205 struct rbd_img_request *img_request;
206 u64 img_offset;
207 /* links for img_request->obj_requests list */
208 struct list_head links;
209 };
210 };
Alex Elderbf0d5f502012-11-22 00:00:08 -0600211 u32 which; /* posn image request list */
212
213 enum obj_request_type type;
Alex Elder788e2df2013-01-17 12:25:27 -0600214 union {
215 struct bio *bio_list;
216 struct {
217 struct page **pages;
218 u32 page_count;
219 };
220 };
Alex Elder0eefd472013-04-19 15:34:50 -0500221 struct page **copyup_pages;
Alex Elderbf0d5f502012-11-22 00:00:08 -0600222
223 struct ceph_osd_request *osd_req;
224
225 u64 xferred; /* bytes transferred */
226 u64 version;
Sage Weil1b83bef2013-02-25 16:11:12 -0800227 int result;
Alex Elderbf0d5f502012-11-22 00:00:08 -0600228
229 rbd_obj_callback_t callback;
Alex Elder788e2df2013-01-17 12:25:27 -0600230 struct completion completion;
Alex Elderbf0d5f502012-11-22 00:00:08 -0600231
232 struct kref kref;
233};
234
Alex Elder0c425242013-02-08 09:55:49 -0600235enum img_req_flags {
Alex Elder9849e982013-01-24 16:13:36 -0600236 IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */
237 IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */
Alex Elderd0b2e942013-01-24 16:13:36 -0600238 IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */
Alex Elder0c425242013-02-08 09:55:49 -0600239};
240
Alex Elderbf0d5f502012-11-22 00:00:08 -0600241struct rbd_img_request {
Alex Elderbf0d5f502012-11-22 00:00:08 -0600242 struct rbd_device *rbd_dev;
243 u64 offset; /* starting image byte offset */
244 u64 length; /* byte count from offset */
Alex Elder0c425242013-02-08 09:55:49 -0600245 unsigned long flags;
Alex Elderbf0d5f502012-11-22 00:00:08 -0600246 union {
Alex Elder9849e982013-01-24 16:13:36 -0600247 u64 snap_id; /* for reads */
Alex Elderbf0d5f502012-11-22 00:00:08 -0600248 struct ceph_snap_context *snapc; /* for writes */
Alex Elder9849e982013-01-24 16:13:36 -0600249 };
250 union {
251 struct request *rq; /* block request */
252 struct rbd_obj_request *obj_request; /* obj req initiator */
Alex Elderbf0d5f502012-11-22 00:00:08 -0600253 };
Alex Elder3d7efd12013-04-19 15:34:50 -0500254 struct page **copyup_pages;
Alex Elderbf0d5f502012-11-22 00:00:08 -0600255 spinlock_t completion_lock;/* protects next_completion */
256 u32 next_completion;
257 rbd_img_callback_t callback;
Alex Elder55f27e02013-04-10 12:34:25 -0500258 u64 xferred;/* aggregate bytes transferred */
Alex Eldera5a337d2013-01-24 16:13:36 -0600259 int result; /* first nonzero obj_request result */
Alex Elderbf0d5f502012-11-22 00:00:08 -0600260
261 u32 obj_request_count;
262 struct list_head obj_requests; /* rbd_obj_request structs */
263
264 struct kref kref;
265};
266
267#define for_each_obj_request(ireq, oreq) \
Alex Elderef06f4d32013-02-08 09:55:48 -0600268 list_for_each_entry(oreq, &(ireq)->obj_requests, links)
Alex Elderbf0d5f502012-11-22 00:00:08 -0600269#define for_each_obj_request_from(ireq, oreq) \
Alex Elderef06f4d32013-02-08 09:55:48 -0600270 list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
Alex Elderbf0d5f502012-11-22 00:00:08 -0600271#define for_each_obj_request_safe(ireq, oreq, n) \
Alex Elderef06f4d32013-02-08 09:55:48 -0600272 list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)

struct rbd_snap {
	struct device		dev;
	const char		*name;
	u64			size;
	struct list_head	node;
	u64			id;
	u64			features;
};

struct rbd_mapping {
	u64			size;
	u64			features;
	bool			read_only;
};

/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;

	char			*header_name;

	struct ceph_file_layout	layout;

	struct ceph_osd_event   *watch_event;
	struct rbd_obj_request	*watch_request;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	struct rbd_device	*parent;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags.  If atomicity is required,
 * rbd_dev->lock is used to protect access.
 *
 * Currently, only the "removing" flag (which is coupled with the
 * "open_count" field) requires atomic access.
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
};
346
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700347static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a82f2012-01-29 13:57:44 -0600348
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700349static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a82f2012-01-29 13:57:44 -0600350static DEFINE_SPINLOCK(rbd_dev_list_lock);
351
Alex Elder432b8582012-01-29 13:57:44 -0600352static LIST_HEAD(rbd_client_list); /* clients */
353static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700354
Alex Elder3d7efd12013-04-19 15:34:50 -0500355static int rbd_img_request_submit(struct rbd_img_request *img_request);
356
Alex Elder304f6802012-08-31 17:29:52 -0500357static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
358static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
359
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800360static void rbd_dev_release(struct device *dev);
Alex Elder41f38c22012-10-25 23:34:40 -0500361static void rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800362
Alex Elderf0f8cef2012-01-29 13:57:44 -0600363static ssize_t rbd_add(struct bus_type *bus, const char *buf,
364 size_t count);
365static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
366 size_t count);
Alex Elder2f82ee52012-10-30 19:40:33 -0500367static int rbd_dev_probe(struct rbd_device *rbd_dev);
Alex Elderf0f8cef2012-01-29 13:57:44 -0600368
369static struct bus_attribute rbd_bus_attrs[] = {
370 __ATTR(add, S_IWUSR, NULL, rbd_add),
371 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
372 __ATTR_NULL
373};
374
375static struct bus_type rbd_bus_type = {
376 .name = "rbd",
377 .bus_attrs = rbd_bus_attrs,
378};
379
380static void rbd_root_dev_release(struct device *dev)
381{
382}
383
384static struct device rbd_root_dev = {
385 .init_name = "rbd",
386 .release = rbd_root_dev_release,
387};
388
Alex Elder06ecc6c2012-11-01 10:17:15 -0500389static __printf(2, 3)
390void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
391{
392 struct va_format vaf;
393 va_list args;
394
395 va_start(args, fmt);
396 vaf.fmt = fmt;
397 vaf.va = &args;
398
399 if (!rbd_dev)
400 printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
401 else if (rbd_dev->disk)
402 printk(KERN_WARNING "%s: %s: %pV\n",
403 RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
404 else if (rbd_dev->spec && rbd_dev->spec->image_name)
405 printk(KERN_WARNING "%s: image %s: %pV\n",
406 RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
407 else if (rbd_dev->spec && rbd_dev->spec->image_id)
408 printk(KERN_WARNING "%s: id %s: %pV\n",
409 RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
410 else /* punt */
411 printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
412 RBD_DRV_NAME, rbd_dev, &vaf);
413 va_end(args);
414}
415
Alex Elderaafb2302012-09-06 16:00:54 -0500416#ifdef RBD_DEBUG
417#define rbd_assert(expr) \
418 if (unlikely(!(expr))) { \
419 printk(KERN_ERR "\nAssertion failure in %s() " \
420 "at line %d:\n\n" \
421 "\trbd_assert(%s);\n\n", \
422 __func__, __LINE__, #expr); \
423 BUG(); \
424 }
425#else /* !RBD_DEBUG */
426# define rbd_assert(expr) ((void) 0)
427#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800428
Alex Elder8b3e1a52013-01-24 16:13:36 -0600429static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
Alex Elderb454e362013-04-19 15:34:50 -0500430static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
Alex Elder8b3e1a52013-01-24 16:13:36 -0600431
Alex Elder117973f2012-08-31 17:29:55 -0500432static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
433static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700434
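/*
 * Block device open: fail writes to a read-only mapping, refuse any
 * open while the device is being removed, and otherwise count the
 * open and take a reference on the rbd device.
 */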
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	(void) get_device(&rbd_dev->dev);
	set_device_ro(bdev, rbd_dev->mapping.read_only);
	mutex_unlock(&ctl_mutex);

	return 0;
}

static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	put_device(&rbd_dev->dev);
	mutex_unlock(&ctl_mutex);

	return 0;
}

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};

/*
 * Initialize an rbd client instance.
 * We own *ceph_opts.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
	if (IS_ERR(rbdc->client))
		goto out_mutex;
	ceph_opts = NULL;	/* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_err;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	mutex_unlock(&ctl_mutex);
	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;

out_err:
	ceph_destroy_client(rbdc->client);
out_mutex:
	mutex_unlock(&ctl_mutex);
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}

/*
 * mount options
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};

struct rbd_options {
	bool	read_only;
};

#define RBD_READ_ONLY_DEFAULT	false

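/*
 * Parse a single token from the rbd options string; "private" is a
 * struct rbd_options pointer.  Integer and string argument classes
 * are decoded generically, but only the Boolean read-only/read-write
 * options are currently acted upon.
 */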
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		rbd_assert(false);
		break;
	}
	return 0;
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc)	/* using an existing client */
		ceph_destroy_options(ceph_opts);
	else
		rbdc = rbd_client_create(ceph_opts);

	return rbdc;
}

/*
 * Destroy ceph client.
 *
 * Called via kref_put(); takes rbd_client_list_lock itself, so the
 * caller must not hold it.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}

/*
 * Create a new header structure, translate header format from the on-disk
 * header.
 */
static int rbd_header_from_disk(struct rbd_image_header *header,
				 struct rbd_image_header_ondisk *ondisk)
{
	u32 snap_count;
	size_t len;
	size_t size;
	u32 i;

	memset(header, 0, sizeof (*header));

	snap_count = le32_to_cpu(ondisk->snap_count);

	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
	if (!header->object_prefix)
		return -ENOMEM;
	memcpy(header->object_prefix, ondisk->object_prefix, len);
	header->object_prefix[len] = '\0';

	if (snap_count) {
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* Save a copy of the snapshot names */

		if (snap_names_len > (u64) SIZE_MAX)
			return -EIO;
		header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!header->snap_names)
			goto out_err;
		/*
		 * Note that rbd_dev_v1_header_read() guarantees
		 * the ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, so this memcpy() is safe.
		 */
		memcpy(header->snap_names, &ondisk->snaps[snap_count],
			snap_names_len);

		/* Record each snapshot's size */

		size = snap_count * sizeof (*header->snap_sizes);
		header->snap_sizes = kmalloc(size, GFP_KERNEL);
		if (!header->snap_sizes)
			goto out_err;
		for (i = 0; i < snap_count; i++)
			header->snap_sizes[i] =
				le64_to_cpu(ondisk->snaps[i].image_size);
	} else {
		WARN_ON(ondisk->snap_names_len);
		header->snap_names = NULL;
		header->snap_sizes = NULL;
	}

	header->features = 0;	/* No features support in v1 images */
	header->obj_order = ondisk->options.order;
	header->crypt_type = ondisk->options.crypt_type;
	header->comp_type = ondisk->options.comp_type;

	/* Allocate and fill in the snapshot context */

	header->image_size = le64_to_cpu(ondisk->image_size);
	size = sizeof (struct ceph_snap_context);
	size += snap_count * sizeof (header->snapc->snaps[0]);
	header->snapc = kzalloc(size, GFP_KERNEL);
	if (!header->snapc)
		goto out_err;

	atomic_set(&header->snapc->nref, 1);
	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
	header->snapc->num_snaps = snap_count;
	for (i = 0; i < snap_count; i++)
		header->snapc->snaps[i] =
			le64_to_cpu(ondisk->snaps[i].id);

	return 0;

out_err:
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	kfree(header->object_prefix);
	header->object_prefix = NULL;

	return -ENOMEM;
}

static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct rbd_snap *snap;

	if (snap_id == CEPH_NOSNAP)
		return RBD_SNAP_HEAD_NAME;

	list_for_each_entry(snap, &rbd_dev->snaps, node)
		if (snap_id == snap->id)
			return snap->name;

	return NULL;
}

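/*
 * Look up a snapshot by name.  On success record its id in the
 * device's spec and its size and features in the mapping, and
 * return 0; return -ENOENT if no snapshot has the given name.
 */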
static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
{
	struct rbd_snap *snap;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!strcmp(snap_name, snap->name)) {
			rbd_dev->spec->snap_id = snap->id;
			rbd_dev->mapping.size = snap->size;
			rbd_dev->mapping.features = snap->features;

			return 0;
		}
	}

	return -ENOENT;
}

static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
{
	int ret;

	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		rbd_dev->spec->snap_id = CEPH_NOSNAP;
		rbd_dev->mapping.size = rbd_dev->header.image_size;
		rbd_dev->mapping.features = rbd_dev->header.features;
		ret = 0;
	} else {
		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
		if (ret < 0)
			goto done;
		rbd_dev->mapping.read_only = true;
	}
	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);

done:
	return ret;
}

static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	ceph_put_snap_context(header->snapc);
	header->snapc = NULL;
}

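/*
 * An rbd image is striped over a set of RADOS objects, each covering
 * (1 << obj_order) bytes of image data.  rbd_segment_name() maps an
 * image byte offset to the name of the object holding it, formatted
 * as "<object_prefix>.<segment number>" with a 12-digit hex segment
 * number; rbd_segment_offset() and rbd_segment_length() locate a
 * byte range within that object.
 */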
static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
{
	char *name;
	u64 segment;
	int ret;

	name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
	if (!name)
		return NULL;
	segment = offset >> rbd_dev->header.obj_order;
	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
			rbd_dev->header.object_prefix, segment);
	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
		pr_err("error formatting segment name for #%llu (%d)\n",
			segment, ret);
		kfree(name);
		name = NULL;
	}

	return name;
}

static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	return offset & (segment_size - 1);
}

static u64 rbd_segment_length(struct rbd_device *rbd_dev,
				u64 offset, u64 length)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	offset &= segment_size - 1;

	rbd_assert(length <= U64_MAX - offset);
	if (offset + length > segment_size)
		length = segment_size - offset;

	return length;
}

/*
 * returns the size of an object in the image
 */
static u64 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1 << header->obj_order;
}

/*
 * bio helpers
 */

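/* Drop a reference on each bio in a chain, walking the bi_next links. */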
static void bio_chain_put(struct bio *chain)
{
	struct bio *tmp;

	while (chain) {
		tmp = chain;
		chain = chain->bi_next;
		bio_put(tmp);
	}
}

/*
 * zeros a bio chain, starting at specific offset
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}

/*
 * similar to zero_bio_chain(), zeros data defined by a page array,
 * starting at the given byte offset from the start of the array and
 * continuing up to the given end offset.  The pages array is
 * assumed to be big enough to hold all bytes up to the end.
 */
static void zero_pages(struct page **pages, u64 offset, u64 end)
{
	struct page **page = &pages[offset >> PAGE_SHIFT];

	rbd_assert(end > offset);
	rbd_assert(end - offset <= (u64)SIZE_MAX);
	while (offset < end) {
		size_t page_offset;
		size_t length;
		unsigned long flags;
		void *kaddr;

		page_offset = (size_t)(offset & ~PAGE_MASK);
		length = min(PAGE_SIZE - page_offset, (size_t)(end - offset));
		local_irq_save(flags);
		kaddr = kmap_atomic(*page);
		memset(kaddr + page_offset, 0, length);
		kunmap_atomic(kaddr);
		local_irq_restore(flags);

		offset += length;
		page++;
	}
}

/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;
	unsigned short end_idx;
	unsigned short vcnt;
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}

/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains the first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi) {
			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
			goto out_err;	/* EINVAL; ran out of bio's */
		}
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	bio_chain_put(chain);

	return NULL;
}

/*
 * The default/initial value for all object request flags is 0.  For
 * each flag, once its value is set to 1 it is never reset to 0
 * again.
 */
static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
{
	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
		struct rbd_device *rbd_dev;

		rbd_dev = obj_request->img_request->rbd_dev;
		rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
			obj_request);
	}
}

static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
}

static void obj_request_done_set(struct rbd_obj_request *obj_request)
{
	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
		struct rbd_device *rbd_dev = NULL;

		if (obj_request_img_data_test(obj_request))
			rbd_dev = obj_request->img_request->rbd_dev;
		rbd_warn(rbd_dev, "obj_request %p already marked done\n",
			obj_request);
	}
}

static bool obj_request_done_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
}

/*
 * This sets the KNOWN flag after (possibly) setting the EXISTS
 * flag.  The latter is set based on the "exists" value provided.
 *
 * Note that for our purposes once an object exists it never goes
 * away again.  It's possible that the responses from two existence
 * checks are separated by the creation of the target object, and
 * that the first ("doesn't exist") response arrives *after* the
 * second ("does exist").  In that case we ignore the late-arriving
 * "doesn't exist" response.
 */
static void obj_request_existence_set(struct rbd_obj_request *obj_request,
				bool exists)
{
	if (exists)
		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
	smp_mb();
}

static bool obj_request_known_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
}

static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
}

static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		atomic_read(&obj_request->kref.refcount));
	kref_get(&obj_request->kref);
}

static void rbd_obj_request_destroy(struct kref *kref);
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		atomic_read(&obj_request->kref.refcount));
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}

static void rbd_img_request_get(struct rbd_img_request *img_request)
{
	dout("%s: img %p (was %d)\n", __func__, img_request,
		atomic_read(&img_request->kref.refcount));
	kref_get(&img_request->kref);
}

static void rbd_img_request_destroy(struct kref *kref);
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	dout("%s: img %p (was %d)\n", __func__, img_request,
		atomic_read(&img_request->kref.refcount));
	kref_put(&img_request->kref, rbd_img_request_destroy);
}

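/*
 * Add an object request to the tail of its image request's list.
 * The image request takes over the object's initial reference, and
 * "which" records the object's position in the list.
 */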
static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->img_request == NULL);

	/* Image request now owns object's original reference */
	obj_request->img_request = img_request;
	obj_request->which = img_request->obj_request_count;
	rbd_assert(!obj_request_img_data_test(obj_request));
	obj_request_img_data_set(obj_request);
	rbd_assert(obj_request->which != BAD_WHICH);
	img_request->obj_request_count++;
	list_add_tail(&obj_request->links, &img_request->obj_requests);
	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
}

static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->which != BAD_WHICH);

	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
	list_del(&obj_request->links);
	rbd_assert(img_request->obj_request_count > 0);
	img_request->obj_request_count--;
	rbd_assert(obj_request->which == img_request->obj_request_count);
	obj_request->which = BAD_WHICH;
	rbd_assert(obj_request_img_data_test(obj_request));
	rbd_assert(obj_request->img_request == img_request);
	obj_request->img_request = NULL;
	obj_request->callback = NULL;
	rbd_obj_request_put(obj_request);
}

static bool obj_request_type_valid(enum obj_request_type type)
{
	switch (type) {
	case OBJ_REQUEST_NODATA:
	case OBJ_REQUEST_BIO:
	case OBJ_REQUEST_PAGES:
		return true;
	default:
		return false;
	}
}

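/* Hand an object request's prepared OSD request off to the OSD client. */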
Alex Elderbf0d5f502012-11-22 00:00:08 -06001303static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1304 struct rbd_obj_request *obj_request)
1305{
Alex Elder37206ee2013-02-20 17:32:08 -06001306 dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
1307
Alex Elderbf0d5f502012-11-22 00:00:08 -06001308 return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1309}
1310
1311static void rbd_img_request_complete(struct rbd_img_request *img_request)
1312{
Alex Elder55f27e02013-04-10 12:34:25 -05001313
Alex Elder37206ee2013-02-20 17:32:08 -06001314 dout("%s: img %p\n", __func__, img_request);
Alex Elder55f27e02013-04-10 12:34:25 -05001315
1316 /*
1317 * If no error occurred, compute the aggregate transfer
1318 * count for the image request. We could instead use
1319 * atomic64_cmpxchg() to update it as each object request
1320 * completes; not clear which way is better off hand.
1321 */
1322 if (!img_request->result) {
1323 struct rbd_obj_request *obj_request;
1324 u64 xferred = 0;
1325
1326 for_each_obj_request(img_request, obj_request)
1327 xferred += obj_request->xferred;
1328 img_request->xferred = xferred;
1329 }
1330
Alex Elderbf0d5f502012-11-22 00:00:08 -06001331 if (img_request->callback)
1332 img_request->callback(img_request);
1333 else
1334 rbd_img_request_put(img_request);
1335}
1336
Alex Elder788e2df2013-01-17 12:25:27 -06001337/* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1338
1339static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1340{
Alex Elder37206ee2013-02-20 17:32:08 -06001341 dout("%s: obj %p\n", __func__, obj_request);
1342
Alex Elder788e2df2013-01-17 12:25:27 -06001343 return wait_for_completion_interruptible(&obj_request->completion);
1344}
1345
Alex Elder0c425242013-02-08 09:55:49 -06001346/*
1347 * The default/initial value for all image request flags is 0. Each
1348 * is conditionally set to 1 at image request initialization time
 1349 * and currently never changes thereafter.
1350 */
1351static void img_request_write_set(struct rbd_img_request *img_request)
1352{
1353 set_bit(IMG_REQ_WRITE, &img_request->flags);
1354 smp_mb();
1355}
1356
1357static bool img_request_write_test(struct rbd_img_request *img_request)
1358{
1359 smp_mb();
1360 return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
1361}
1362
Alex Elder9849e982013-01-24 16:13:36 -06001363static void img_request_child_set(struct rbd_img_request *img_request)
1364{
1365 set_bit(IMG_REQ_CHILD, &img_request->flags);
1366 smp_mb();
1367}
1368
1369static bool img_request_child_test(struct rbd_img_request *img_request)
1370{
1371 smp_mb();
1372 return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
1373}
1374
Alex Elderd0b2e942013-01-24 16:13:36 -06001375static void img_request_layered_set(struct rbd_img_request *img_request)
1376{
1377 set_bit(IMG_REQ_LAYERED, &img_request->flags);
1378 smp_mb();
1379}
1380
1381static bool img_request_layered_test(struct rbd_img_request *img_request)
1382{
1383 smp_mb();
1384 return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1385}
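
/*
 * Illustrative sketch only (not part of the driver): the pattern the
 * flag helpers above follow. A flag is set once with set_bit()
 * followed by smp_mb(), and readers issue smp_mb() before test_bit(),
 * so a reader that sees the flag also sees the stores made before it
 * was set. The "example_" names are hypothetical.
 */
static void example_flag_set(unsigned long *flags, int bit)
{
	set_bit(bit, flags);
	smp_mb();	/* publish the flag after earlier stores */
}

static bool example_flag_test(unsigned long *flags, int bit)
{
	smp_mb();	/* pairs with the barrier in example_flag_set() */
	return test_bit(bit, flags) != 0;
}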
1386
Alex Elder6e2a4502013-03-27 09:16:30 -05001387static void
1388rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
1389{
Alex Elderb9434c52013-04-19 15:34:50 -05001390 u64 xferred = obj_request->xferred;
1391 u64 length = obj_request->length;
1392
Alex Elder6e2a4502013-03-27 09:16:30 -05001393 dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1394 obj_request, obj_request->img_request, obj_request->result,
Alex Elderb9434c52013-04-19 15:34:50 -05001395 xferred, length);
Alex Elder6e2a4502013-03-27 09:16:30 -05001396 /*
1397 * ENOENT means a hole in the image. We zero-fill the
1398 * entire length of the request. A short read also implies
1399 * zero-fill to the end of the request. Either way we
1400 * update the xferred count to indicate the whole request
1401 * was satisfied.
1402 */
Alex Elderb9434c52013-04-19 15:34:50 -05001403 rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
Alex Elder6e2a4502013-03-27 09:16:30 -05001404 if (obj_request->result == -ENOENT) {
Alex Elderb9434c52013-04-19 15:34:50 -05001405 if (obj_request->type == OBJ_REQUEST_BIO)
1406 zero_bio_chain(obj_request->bio_list, 0);
1407 else
1408 zero_pages(obj_request->pages, 0, length);
Alex Elder6e2a4502013-03-27 09:16:30 -05001409 obj_request->result = 0;
Alex Elderb9434c52013-04-19 15:34:50 -05001410 obj_request->xferred = length;
1411 } else if (xferred < length && !obj_request->result) {
1412 if (obj_request->type == OBJ_REQUEST_BIO)
1413 zero_bio_chain(obj_request->bio_list, xferred);
1414 else
1415 zero_pages(obj_request->pages, xferred, length);
1416 obj_request->xferred = length;
Alex Elder6e2a4502013-03-27 09:16:30 -05001417 }
1418 obj_request_done_set(obj_request);
1419}
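
/*
 * Hypothetical illustration of the zero-fill rules implemented in
 * rbd_img_obj_request_read_callback() above, using a plain linear
 * buffer rather than a bio chain or page vector: ENOENT zero-fills
 * the whole request, a successful short read zero-fills the tail,
 * and either way the full length is reported as transferred.
 * example_zero_fill() is not part of the driver.
 */
static void example_zero_fill(char *buf, u64 length, u64 xferred, int result)
{
	if (result == -ENOENT)
		memset(buf, 0, length);		/* hole: fill everything */
	else if (!result && xferred < length)
		memset(buf + xferred, 0, length - xferred); /* short read */
}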
1420
Alex Elderbf0d5f502012-11-22 00:00:08 -06001421static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1422{
Alex Elder37206ee2013-02-20 17:32:08 -06001423 dout("%s: obj %p cb %p\n", __func__, obj_request,
1424 obj_request->callback);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001425 if (obj_request->callback)
1426 obj_request->callback(obj_request);
Alex Elder788e2df2013-01-17 12:25:27 -06001427 else
1428 complete_all(&obj_request->completion);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001429}
1430
Alex Elderc47f9372013-02-26 14:23:07 -06001431static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
Alex Elder39bf2c52013-02-26 14:23:07 -06001432{
1433 dout("%s: obj %p\n", __func__, obj_request);
1434 obj_request_done_set(obj_request);
1435}
1436
Alex Elderc47f9372013-02-26 14:23:07 -06001437static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001438{
Alex Elder57acbaa2013-02-11 12:33:24 -06001439 struct rbd_img_request *img_request = NULL;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05001440 struct rbd_device *rbd_dev = NULL;
Alex Elder57acbaa2013-02-11 12:33:24 -06001441 bool layered = false;
1442
1443 if (obj_request_img_data_test(obj_request)) {
1444 img_request = obj_request->img_request;
1445 layered = img_request && img_request_layered_test(img_request);
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05001446 rbd_dev = img_request->rbd_dev;
Alex Elder57acbaa2013-02-11 12:33:24 -06001447 }
Alex Elder8b3e1a52013-01-24 16:13:36 -06001448
1449 dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1450 obj_request, img_request, obj_request->result,
1451 obj_request->xferred, obj_request->length);
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05001452 if (layered && obj_request->result == -ENOENT &&
1453 obj_request->img_offset < rbd_dev->parent_overlap)
Alex Elder8b3e1a52013-01-24 16:13:36 -06001454 rbd_img_parent_read(obj_request);
1455 else if (img_request)
Alex Elder6e2a4502013-03-27 09:16:30 -05001456 rbd_img_obj_request_read_callback(obj_request);
1457 else
1458 obj_request_done_set(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001459}
1460
Alex Elderc47f9372013-02-26 14:23:07 -06001461static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001462{
Sage Weil1b83bef2013-02-25 16:11:12 -08001463 dout("%s: obj %p result %d %llu\n", __func__, obj_request,
1464 obj_request->result, obj_request->length);
1465 /*
Alex Elder8b3e1a52013-01-24 16:13:36 -06001466	 * There is no such thing as a successful short write.  Set
 1467	 * the transfer count to the originally-requested length.
Sage Weil1b83bef2013-02-25 16:11:12 -08001468 */
1469 obj_request->xferred = obj_request->length;
Alex Elder07741302013-02-05 23:41:50 -06001470 obj_request_done_set(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001471}
1472
Alex Elderfbfab532013-02-08 09:55:48 -06001473/*
1474 * For a simple stat call there's nothing to do. We'll do more if
1475 * this is part of a write sequence for a layered image.
1476 */
Alex Elderc47f9372013-02-26 14:23:07 -06001477static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
Alex Elderfbfab532013-02-08 09:55:48 -06001478{
Alex Elder37206ee2013-02-20 17:32:08 -06001479 dout("%s: obj %p\n", __func__, obj_request);
Alex Elderfbfab532013-02-08 09:55:48 -06001480 obj_request_done_set(obj_request);
1481}
1482
Alex Elderbf0d5f502012-11-22 00:00:08 -06001483static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1484 struct ceph_msg *msg)
1485{
1486 struct rbd_obj_request *obj_request = osd_req->r_priv;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001487 u16 opcode;
1488
Alex Elder37206ee2013-02-20 17:32:08 -06001489 dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001490 rbd_assert(osd_req == obj_request->osd_req);
Alex Elder57acbaa2013-02-11 12:33:24 -06001491 if (obj_request_img_data_test(obj_request)) {
1492 rbd_assert(obj_request->img_request);
1493 rbd_assert(obj_request->which != BAD_WHICH);
1494 } else {
1495 rbd_assert(obj_request->which == BAD_WHICH);
1496 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06001497
Sage Weil1b83bef2013-02-25 16:11:12 -08001498 if (osd_req->r_result < 0)
1499 obj_request->result = osd_req->r_result;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001500 obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
1501
Alex Elder0eefd472013-04-19 15:34:50 -05001502 BUG_ON(osd_req->r_num_ops > 2);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001503
Alex Elderc47f9372013-02-26 14:23:07 -06001504 /*
1505 * We support a 64-bit length, but ultimately it has to be
1506 * passed to blk_end_request(), which takes an unsigned int.
1507 */
Sage Weil1b83bef2013-02-25 16:11:12 -08001508 obj_request->xferred = osd_req->r_reply_op_len[0];
Alex Elder8b3e1a52013-01-24 16:13:36 -06001509 rbd_assert(obj_request->xferred < (u64)UINT_MAX);
Alex Elder79528732013-04-03 21:32:51 -05001510 opcode = osd_req->r_ops[0].op;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001511 switch (opcode) {
1512 case CEPH_OSD_OP_READ:
Alex Elderc47f9372013-02-26 14:23:07 -06001513 rbd_osd_read_callback(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001514 break;
1515 case CEPH_OSD_OP_WRITE:
Alex Elderc47f9372013-02-26 14:23:07 -06001516 rbd_osd_write_callback(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001517 break;
Alex Elderfbfab532013-02-08 09:55:48 -06001518 case CEPH_OSD_OP_STAT:
Alex Elderc47f9372013-02-26 14:23:07 -06001519 rbd_osd_stat_callback(obj_request);
Alex Elderfbfab532013-02-08 09:55:48 -06001520 break;
Alex Elder36be9a72013-01-19 00:30:28 -06001521 case CEPH_OSD_OP_CALL:
Alex Elderb8d70032012-11-30 17:53:04 -06001522 case CEPH_OSD_OP_NOTIFY_ACK:
Alex Elder9969ebc2013-01-18 12:31:10 -06001523 case CEPH_OSD_OP_WATCH:
Alex Elderc47f9372013-02-26 14:23:07 -06001524 rbd_osd_trivial_callback(obj_request);
Alex Elder9969ebc2013-01-18 12:31:10 -06001525 break;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001526 default:
1527 rbd_warn(NULL, "%s: unsupported op %hu\n",
1528 obj_request->object_name, (unsigned short) opcode);
1529 break;
1530 }
1531
Alex Elder07741302013-02-05 23:41:50 -06001532 if (obj_request_done_test(obj_request))
Alex Elderbf0d5f502012-11-22 00:00:08 -06001533 rbd_obj_request_complete(obj_request);
1534}
1535
Alex Elder9d4df012013-04-19 15:34:50 -05001536static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
Alex Elder430c28c2013-04-03 21:32:51 -05001537{
1538 struct rbd_img_request *img_request = obj_request->img_request;
Alex Elder8c042b02013-04-03 01:28:58 -05001539 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder9d4df012013-04-19 15:34:50 -05001540 u64 snap_id;
Alex Elder430c28c2013-04-03 21:32:51 -05001541
Alex Elder8c042b02013-04-03 01:28:58 -05001542 rbd_assert(osd_req != NULL);
Alex Elder430c28c2013-04-03 21:32:51 -05001543
Alex Elder9d4df012013-04-19 15:34:50 -05001544 snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
Alex Elder8c042b02013-04-03 01:28:58 -05001545 ceph_osdc_build_request(osd_req, obj_request->offset,
Alex Elder9d4df012013-04-19 15:34:50 -05001546 NULL, snap_id, NULL);
1547}
1548
1549static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
1550{
1551 struct rbd_img_request *img_request = obj_request->img_request;
1552 struct ceph_osd_request *osd_req = obj_request->osd_req;
1553 struct ceph_snap_context *snapc;
1554 struct timespec mtime = CURRENT_TIME;
1555
1556 rbd_assert(osd_req != NULL);
1557
1558 snapc = img_request ? img_request->snapc : NULL;
1559 ceph_osdc_build_request(osd_req, obj_request->offset,
1560 snapc, CEPH_NOSNAP, &mtime);
Alex Elder430c28c2013-04-03 21:32:51 -05001561}
1562
Alex Elderbf0d5f502012-11-22 00:00:08 -06001563static struct ceph_osd_request *rbd_osd_req_create(
1564 struct rbd_device *rbd_dev,
1565 bool write_request,
Alex Elder430c28c2013-04-03 21:32:51 -05001566 struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001567{
Alex Elderbf0d5f502012-11-22 00:00:08 -06001568 struct ceph_snap_context *snapc = NULL;
1569 struct ceph_osd_client *osdc;
1570 struct ceph_osd_request *osd_req;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001571
Alex Elder6365d332013-02-11 12:33:24 -06001572 if (obj_request_img_data_test(obj_request)) {
1573 struct rbd_img_request *img_request = obj_request->img_request;
1574
Alex Elder0c425242013-02-08 09:55:49 -06001575 rbd_assert(write_request ==
1576 img_request_write_test(img_request));
1577 if (write_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001578 snapc = img_request->snapc;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001579 }
1580
1581 /* Allocate and initialize the request, for the single op */
1582
1583 osdc = &rbd_dev->rbd_client->client->osdc;
1584 osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1585 if (!osd_req)
1586 return NULL; /* ENOMEM */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001587
Alex Elder430c28c2013-04-03 21:32:51 -05001588 if (write_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001589 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
Alex Elder430c28c2013-04-03 21:32:51 -05001590 else
Alex Elderbf0d5f502012-11-22 00:00:08 -06001591 osd_req->r_flags = CEPH_OSD_FLAG_READ;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001592
1593 osd_req->r_callback = rbd_osd_req_callback;
1594 osd_req->r_priv = obj_request;
1595
1596 osd_req->r_oid_len = strlen(obj_request->object_name);
1597 rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1598 memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1599
1600 osd_req->r_file_layout = rbd_dev->layout; /* struct */
1601
Alex Elderbf0d5f502012-11-22 00:00:08 -06001602 return osd_req;
1603}
1604
Alex Elder0eefd472013-04-19 15:34:50 -05001605/*
1606 * Create a copyup osd request based on the information in the
 1607 * object request supplied.  A copyup request has two osd ops:
 1608 * a copyup method call and a "normal" write request.
1609 */
1610static struct ceph_osd_request *
1611rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
1612{
1613 struct rbd_img_request *img_request;
1614 struct ceph_snap_context *snapc;
1615 struct rbd_device *rbd_dev;
1616 struct ceph_osd_client *osdc;
1617 struct ceph_osd_request *osd_req;
1618
1619 rbd_assert(obj_request_img_data_test(obj_request));
1620 img_request = obj_request->img_request;
1621 rbd_assert(img_request);
1622 rbd_assert(img_request_write_test(img_request));
1623
1624 /* Allocate and initialize the request, for the two ops */
1625
1626 snapc = img_request->snapc;
1627 rbd_dev = img_request->rbd_dev;
1628 osdc = &rbd_dev->rbd_client->client->osdc;
1629 osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
1630 if (!osd_req)
1631 return NULL; /* ENOMEM */
1632
1633 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1634 osd_req->r_callback = rbd_osd_req_callback;
1635 osd_req->r_priv = obj_request;
1636
1637 osd_req->r_oid_len = strlen(obj_request->object_name);
1638 rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1639 memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1640
1641 osd_req->r_file_layout = rbd_dev->layout; /* struct */
1642
1643 return osd_req;
1644}
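
/*
 * Condensed sketch of how the two ops of a copyup request created
 * above get filled in; the real code is in
 * rbd_img_obj_parent_read_full_callback() below. Op 0 is the "rbd"
 * class "copyup" method call carrying the parent data, and op 1 is
 * the original write. Illustration only, not driver code.
 */
static void example_fill_copyup_ops(struct rbd_obj_request *orig_request,
					struct page **pages, u64 obj_size)
{
	struct ceph_osd_request *osd_req = orig_request->osd_req;

	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
	osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0,
						false, false);
	osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
				orig_request->offset, orig_request->length,
				0, 0);
	osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list,
				orig_request->length);
	rbd_osd_req_format_write(orig_request);
}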
1645
1646
Alex Elderbf0d5f502012-11-22 00:00:08 -06001647static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
1648{
1649 ceph_osdc_put_request(osd_req);
1650}
1651
1652/* object_name is assumed to be a non-null pointer and NUL-terminated */
1653
1654static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1655 u64 offset, u64 length,
1656 enum obj_request_type type)
1657{
1658 struct rbd_obj_request *obj_request;
1659 size_t size;
1660 char *name;
1661
1662 rbd_assert(obj_request_type_valid(type));
1663
1664 size = strlen(object_name) + 1;
1665 obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1666 if (!obj_request)
1667 return NULL;
1668
1669 name = (char *)(obj_request + 1);
1670 obj_request->object_name = memcpy(name, object_name, size);
1671 obj_request->offset = offset;
1672 obj_request->length = length;
Alex Elder926f9b32013-02-11 12:33:24 -06001673 obj_request->flags = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001674 obj_request->which = BAD_WHICH;
1675 obj_request->type = type;
1676 INIT_LIST_HEAD(&obj_request->links);
Alex Elder788e2df2013-01-17 12:25:27 -06001677 init_completion(&obj_request->completion);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001678 kref_init(&obj_request->kref);
1679
Alex Elder37206ee2013-02-20 17:32:08 -06001680 dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
1681 offset, length, (int)type, obj_request);
1682
Alex Elderbf0d5f502012-11-22 00:00:08 -06001683 return obj_request;
1684}
1685
1686static void rbd_obj_request_destroy(struct kref *kref)
1687{
1688 struct rbd_obj_request *obj_request;
1689
1690 obj_request = container_of(kref, struct rbd_obj_request, kref);
1691
Alex Elder37206ee2013-02-20 17:32:08 -06001692 dout("%s: obj %p\n", __func__, obj_request);
1693
Alex Elderbf0d5f502012-11-22 00:00:08 -06001694 rbd_assert(obj_request->img_request == NULL);
1695 rbd_assert(obj_request->which == BAD_WHICH);
1696
1697 if (obj_request->osd_req)
1698 rbd_osd_req_destroy(obj_request->osd_req);
1699
1700 rbd_assert(obj_request_type_valid(obj_request->type));
1701 switch (obj_request->type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001702 case OBJ_REQUEST_NODATA:
1703 break; /* Nothing to do */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001704 case OBJ_REQUEST_BIO:
1705 if (obj_request->bio_list)
1706 bio_chain_put(obj_request->bio_list);
1707 break;
Alex Elder788e2df2013-01-17 12:25:27 -06001708 case OBJ_REQUEST_PAGES:
1709 if (obj_request->pages)
1710 ceph_release_page_vector(obj_request->pages,
1711 obj_request->page_count);
1712 break;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001713 }
1714
1715 kfree(obj_request);
1716}
1717
1718/*
1719 * Caller is responsible for filling in the list of object requests
1720 * that comprises the image request, and the Linux request pointer
1721 * (if there is one).
1722 */
Alex Eldercc344fa2013-02-19 12:25:56 -06001723static struct rbd_img_request *rbd_img_request_create(
1724 struct rbd_device *rbd_dev,
Alex Elderbf0d5f502012-11-22 00:00:08 -06001725 u64 offset, u64 length,
Alex Elder9849e982013-01-24 16:13:36 -06001726 bool write_request,
1727 bool child_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001728{
1729 struct rbd_img_request *img_request;
1730 struct ceph_snap_context *snapc = NULL;
1731
1732 img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
1733 if (!img_request)
1734 return NULL;
1735
1736 if (write_request) {
1737 down_read(&rbd_dev->header_rwsem);
1738 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1739 up_read(&rbd_dev->header_rwsem);
1740 if (WARN_ON(!snapc)) {
1741 kfree(img_request);
1742 return NULL; /* Shouldn't happen */
1743 }
Alex Elder0c425242013-02-08 09:55:49 -06001744
Alex Elderbf0d5f502012-11-22 00:00:08 -06001745 }
1746
1747 img_request->rq = NULL;
1748 img_request->rbd_dev = rbd_dev;
1749 img_request->offset = offset;
1750 img_request->length = length;
Alex Elder0c425242013-02-08 09:55:49 -06001751 img_request->flags = 0;
1752 if (write_request) {
1753 img_request_write_set(img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001754 img_request->snapc = snapc;
Alex Elder0c425242013-02-08 09:55:49 -06001755 } else {
Alex Elderbf0d5f502012-11-22 00:00:08 -06001756 img_request->snap_id = rbd_dev->spec->snap_id;
Alex Elder0c425242013-02-08 09:55:49 -06001757 }
Alex Elder9849e982013-01-24 16:13:36 -06001758 if (child_request)
1759 img_request_child_set(img_request);
Alex Elderd0b2e942013-01-24 16:13:36 -06001760 if (rbd_dev->parent_spec)
1761 img_request_layered_set(img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001762 spin_lock_init(&img_request->completion_lock);
1763 img_request->next_completion = 0;
1764 img_request->callback = NULL;
Alex Eldera5a337d2013-01-24 16:13:36 -06001765 img_request->result = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001766 img_request->obj_request_count = 0;
1767 INIT_LIST_HEAD(&img_request->obj_requests);
1768 kref_init(&img_request->kref);
1769
1770 rbd_img_request_get(img_request); /* Avoid a warning */
1771 rbd_img_request_put(img_request); /* TEMPORARY */
1772
Alex Elder37206ee2013-02-20 17:32:08 -06001773 dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
1774 write_request ? "write" : "read", offset, length,
1775 img_request);
1776
Alex Elderbf0d5f502012-11-22 00:00:08 -06001777 return img_request;
1778}
1779
1780static void rbd_img_request_destroy(struct kref *kref)
1781{
1782 struct rbd_img_request *img_request;
1783 struct rbd_obj_request *obj_request;
1784 struct rbd_obj_request *next_obj_request;
1785
1786 img_request = container_of(kref, struct rbd_img_request, kref);
1787
Alex Elder37206ee2013-02-20 17:32:08 -06001788 dout("%s: img %p\n", __func__, img_request);
1789
Alex Elderbf0d5f502012-11-22 00:00:08 -06001790 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1791 rbd_img_obj_request_del(img_request, obj_request);
Alex Elder25dcf952013-01-25 17:08:55 -06001792 rbd_assert(img_request->obj_request_count == 0);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001793
Alex Elder0c425242013-02-08 09:55:49 -06001794 if (img_request_write_test(img_request))
Alex Elderbf0d5f502012-11-22 00:00:08 -06001795 ceph_put_snap_context(img_request->snapc);
1796
Alex Elder8b3e1a52013-01-24 16:13:36 -06001797 if (img_request_child_test(img_request))
1798 rbd_obj_request_put(img_request->obj_request);
1799
Alex Elderbf0d5f502012-11-22 00:00:08 -06001800 kfree(img_request);
1801}
1802
Alex Elder12178572013-02-08 09:55:49 -06001803static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
1804{
Alex Elder6365d332013-02-11 12:33:24 -06001805 struct rbd_img_request *img_request;
Alex Elder12178572013-02-08 09:55:49 -06001806 unsigned int xferred;
1807 int result;
Alex Elder8b3e1a52013-01-24 16:13:36 -06001808 bool more;
Alex Elder12178572013-02-08 09:55:49 -06001809
Alex Elder6365d332013-02-11 12:33:24 -06001810 rbd_assert(obj_request_img_data_test(obj_request));
1811 img_request = obj_request->img_request;
1812
Alex Elder12178572013-02-08 09:55:49 -06001813 rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
1814 xferred = (unsigned int)obj_request->xferred;
1815 result = obj_request->result;
1816 if (result) {
1817 struct rbd_device *rbd_dev = img_request->rbd_dev;
1818
1819 rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
1820 img_request_write_test(img_request) ? "write" : "read",
1821 obj_request->length, obj_request->img_offset,
1822 obj_request->offset);
1823 rbd_warn(rbd_dev, " result %d xferred %x\n",
1824 result, xferred);
1825 if (!img_request->result)
1826 img_request->result = result;
1827 }
1828
Alex Elderf1a47392013-04-19 15:34:50 -05001829 /* Image object requests don't own their page array */
1830
1831 if (obj_request->type == OBJ_REQUEST_PAGES) {
1832 obj_request->pages = NULL;
1833 obj_request->page_count = 0;
1834 }
1835
Alex Elder8b3e1a52013-01-24 16:13:36 -06001836 if (img_request_child_test(img_request)) {
1837 rbd_assert(img_request->obj_request != NULL);
1838 more = obj_request->which < img_request->obj_request_count - 1;
1839 } else {
1840 rbd_assert(img_request->rq != NULL);
1841 more = blk_end_request(img_request->rq, result, xferred);
1842 }
1843
1844 return more;
Alex Elder12178572013-02-08 09:55:49 -06001845}
1846
Alex Elder21692382013-04-05 01:27:12 -05001847static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
1848{
1849 struct rbd_img_request *img_request;
1850 u32 which = obj_request->which;
1851 bool more = true;
1852
Alex Elder6365d332013-02-11 12:33:24 -06001853 rbd_assert(obj_request_img_data_test(obj_request));
Alex Elder21692382013-04-05 01:27:12 -05001854 img_request = obj_request->img_request;
1855
1856 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1857 rbd_assert(img_request != NULL);
Alex Elder21692382013-04-05 01:27:12 -05001858 rbd_assert(img_request->obj_request_count > 0);
1859 rbd_assert(which != BAD_WHICH);
1860 rbd_assert(which < img_request->obj_request_count);
1861 rbd_assert(which >= img_request->next_completion);
1862
1863 spin_lock_irq(&img_request->completion_lock);
1864 if (which != img_request->next_completion)
1865 goto out;
1866
1867 for_each_obj_request_from(img_request, obj_request) {
Alex Elder21692382013-04-05 01:27:12 -05001868 rbd_assert(more);
1869 rbd_assert(which < img_request->obj_request_count);
1870
1871 if (!obj_request_done_test(obj_request))
1872 break;
Alex Elder12178572013-02-08 09:55:49 -06001873 more = rbd_img_obj_end_request(obj_request);
Alex Elder21692382013-04-05 01:27:12 -05001874 which++;
1875 }
1876
1877 rbd_assert(more ^ (which == img_request->obj_request_count));
1878 img_request->next_completion = which;
1879out:
1880 spin_unlock_irq(&img_request->completion_lock);
1881
1882 if (!more)
1883 rbd_img_request_complete(img_request);
1884}
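
/*
 * Illustrative model (not driver code) of the ordering enforced in
 * rbd_img_obj_callback() above: object requests may complete in any
 * order, but next_completion only advances across a contiguous
 * prefix of completed requests, so rbd_img_obj_end_request() always
 * runs in submission order.
 */
static u32 example_advance_completion(const bool *done, u32 next, u32 count)
{
	while (next < count && done[next])
		next++;		/* stop at the first incomplete request */
	return next;
}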
1885
Alex Elderf1a47392013-04-19 15:34:50 -05001886/*
1887 * Split up an image request into one or more object requests, each
1888 * to a different object. The "type" parameter indicates whether
1889 * "data_desc" is the pointer to the head of a list of bio
1890 * structures, or the base of a page array. In either case this
1891 * function assumes data_desc describes memory sufficient to hold
1892 * all data described by the image request.
1893 */
1894static int rbd_img_request_fill(struct rbd_img_request *img_request,
1895 enum obj_request_type type,
1896 void *data_desc)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001897{
1898 struct rbd_device *rbd_dev = img_request->rbd_dev;
1899 struct rbd_obj_request *obj_request = NULL;
1900 struct rbd_obj_request *next_obj_request;
Alex Elder0c425242013-02-08 09:55:49 -06001901 bool write_request = img_request_write_test(img_request);
Alex Elderf1a47392013-04-19 15:34:50 -05001902 struct bio *bio_list;
1903 unsigned int bio_offset = 0;
1904 struct page **pages;
Alex Elder7da22d22013-01-24 16:13:36 -06001905 u64 img_offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001906 u64 resid;
1907 u16 opcode;
1908
Alex Elderf1a47392013-04-19 15:34:50 -05001909 dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
1910 (int)type, data_desc);
Alex Elder37206ee2013-02-20 17:32:08 -06001911
Alex Elder430c28c2013-04-03 21:32:51 -05001912 opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
Alex Elder7da22d22013-01-24 16:13:36 -06001913 img_offset = img_request->offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001914 resid = img_request->length;
Alex Elder4dda41d2013-02-20 21:59:33 -06001915 rbd_assert(resid > 0);
Alex Elderf1a47392013-04-19 15:34:50 -05001916
1917 if (type == OBJ_REQUEST_BIO) {
1918 bio_list = data_desc;
1919 rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
1920 } else {
1921 rbd_assert(type == OBJ_REQUEST_PAGES);
1922 pages = data_desc;
1923 }
1924
Alex Elderbf0d5f502012-11-22 00:00:08 -06001925 while (resid) {
Alex Elder2fa12322013-04-05 01:27:12 -05001926 struct ceph_osd_request *osd_req;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001927 const char *object_name;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001928 u64 offset;
1929 u64 length;
1930
Alex Elder7da22d22013-01-24 16:13:36 -06001931 object_name = rbd_segment_name(rbd_dev, img_offset);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001932 if (!object_name)
1933 goto out_unwind;
Alex Elder7da22d22013-01-24 16:13:36 -06001934 offset = rbd_segment_offset(rbd_dev, img_offset);
1935 length = rbd_segment_length(rbd_dev, img_offset, resid);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001936 obj_request = rbd_obj_request_create(object_name,
Alex Elderf1a47392013-04-19 15:34:50 -05001937 offset, length, type);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001938 kfree(object_name); /* object request has its own copy */
1939 if (!obj_request)
1940 goto out_unwind;
1941
Alex Elderf1a47392013-04-19 15:34:50 -05001942 if (type == OBJ_REQUEST_BIO) {
1943 unsigned int clone_size;
1944
1945 rbd_assert(length <= (u64)UINT_MAX);
1946 clone_size = (unsigned int)length;
1947 obj_request->bio_list =
1948 bio_chain_clone_range(&bio_list,
1949 &bio_offset,
1950 clone_size,
1951 GFP_ATOMIC);
1952 if (!obj_request->bio_list)
1953 goto out_partial;
1954 } else {
1955 unsigned int page_count;
1956
1957 obj_request->pages = pages;
1958 page_count = (u32)calc_pages_for(offset, length);
1959 obj_request->page_count = page_count;
1960 if ((offset + length) & ~PAGE_MASK)
1961 page_count--; /* more on last page */
1962 pages += page_count;
1963 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06001964
Alex Elder2fa12322013-04-05 01:27:12 -05001965 osd_req = rbd_osd_req_create(rbd_dev, write_request,
1966 obj_request);
1967 if (!osd_req)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001968 goto out_partial;
Alex Elder2fa12322013-04-05 01:27:12 -05001969 obj_request->osd_req = osd_req;
Alex Elder21692382013-04-05 01:27:12 -05001970 obj_request->callback = rbd_img_obj_callback;
Alex Elder430c28c2013-04-03 21:32:51 -05001971
Alex Elder2fa12322013-04-05 01:27:12 -05001972 osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
1973 0, 0);
Alex Elderf1a47392013-04-19 15:34:50 -05001974 if (type == OBJ_REQUEST_BIO)
1975 osd_req_op_extent_osd_data_bio(osd_req, 0,
1976 obj_request->bio_list, length);
1977 else
1978 osd_req_op_extent_osd_data_pages(osd_req, 0,
1979 obj_request->pages, length,
1980 offset & ~PAGE_MASK, false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05001981
1982 if (write_request)
1983 rbd_osd_req_format_write(obj_request);
1984 else
1985 rbd_osd_req_format_read(obj_request);
Alex Elder430c28c2013-04-03 21:32:51 -05001986
Alex Elder7da22d22013-01-24 16:13:36 -06001987 obj_request->img_offset = img_offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001988 rbd_img_obj_request_add(img_request, obj_request);
1989
Alex Elder7da22d22013-01-24 16:13:36 -06001990 img_offset += length;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001991 resid -= length;
1992 }
1993
1994 return 0;
1995
1996out_partial:
1997 rbd_obj_request_put(obj_request);
1998out_unwind:
1999 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2000 rbd_obj_request_put(obj_request);
2001
2002 return -ENOMEM;
2003}
2004
Alex Elder3d7efd12013-04-19 15:34:50 -05002005static void
Alex Elder0eefd472013-04-19 15:34:50 -05002006rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
2007{
2008 struct rbd_img_request *img_request;
2009 struct rbd_device *rbd_dev;
2010 u64 length;
2011 u32 page_count;
2012
2013 rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2014 rbd_assert(obj_request_img_data_test(obj_request));
2015 img_request = obj_request->img_request;
2016 rbd_assert(img_request);
2017
2018 rbd_dev = img_request->rbd_dev;
2019 rbd_assert(rbd_dev);
2020 length = (u64)1 << rbd_dev->header.obj_order;
2021 page_count = (u32)calc_pages_for(0, length);
2022
2023 rbd_assert(obj_request->copyup_pages);
2024 ceph_release_page_vector(obj_request->copyup_pages, page_count);
2025 obj_request->copyup_pages = NULL;
2026
2027 /*
2028 * We want the transfer count to reflect the size of the
2029 * original write request. There is no such thing as a
2030 * successful short write, so if the request was successful
2031 * we can just set it to the originally-requested length.
2032 */
2033 if (!obj_request->result)
2034 obj_request->xferred = obj_request->length;
2035
2036 /* Finish up with the normal image object callback */
2037
2038 rbd_img_obj_callback(obj_request);
2039}
2040
2041static void
Alex Elder3d7efd12013-04-19 15:34:50 -05002042rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
2043{
2044 struct rbd_obj_request *orig_request;
Alex Elder0eefd472013-04-19 15:34:50 -05002045 struct ceph_osd_request *osd_req;
2046 struct ceph_osd_client *osdc;
2047 struct rbd_device *rbd_dev;
Alex Elder3d7efd12013-04-19 15:34:50 -05002048 struct page **pages;
Alex Elder3d7efd12013-04-19 15:34:50 -05002049 int result;
2050 u64 obj_size;
2051 u64 xferred;
2052
2053 rbd_assert(img_request_child_test(img_request));
2054
2055 /* First get what we need from the image request */
2056
2057 pages = img_request->copyup_pages;
2058 rbd_assert(pages != NULL);
2059 img_request->copyup_pages = NULL;
2060
2061 orig_request = img_request->obj_request;
2062 rbd_assert(orig_request != NULL);
Alex Elder0eefd472013-04-19 15:34:50 -05002063 rbd_assert(orig_request->type == OBJ_REQUEST_BIO);
Alex Elder3d7efd12013-04-19 15:34:50 -05002064 result = img_request->result;
2065 obj_size = img_request->length;
2066 xferred = img_request->xferred;
2067
Alex Elder0eefd472013-04-19 15:34:50 -05002068 rbd_dev = img_request->rbd_dev;
2069 rbd_assert(rbd_dev);
2070 rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order);
2071
Alex Elder3d7efd12013-04-19 15:34:50 -05002072 rbd_img_request_put(img_request);
2073
Alex Elder0eefd472013-04-19 15:34:50 -05002074 if (result)
2075 goto out_err;
Alex Elder3d7efd12013-04-19 15:34:50 -05002076
Alex Elder0eefd472013-04-19 15:34:50 -05002077 /* Allocate the new copyup osd request for the original request */
Alex Elder3d7efd12013-04-19 15:34:50 -05002078
Alex Elder0eefd472013-04-19 15:34:50 -05002079 result = -ENOMEM;
2080 rbd_assert(!orig_request->osd_req);
2081 osd_req = rbd_osd_req_create_copyup(orig_request);
2082 if (!osd_req)
2083 goto out_err;
2084 orig_request->osd_req = osd_req;
2085 orig_request->copyup_pages = pages;
Alex Elder3d7efd12013-04-19 15:34:50 -05002086
Alex Elder0eefd472013-04-19 15:34:50 -05002087 /* Initialize the copyup op */
2088
2089 osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2090 osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0,
2091 false, false);
2092
2093 /* Then the original write request op */
2094
2095 osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
2096 orig_request->offset,
2097 orig_request->length, 0, 0);
2098 osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list,
2099 orig_request->length);
2100
2101 rbd_osd_req_format_write(orig_request);
2102
2103 /* All set, send it off. */
2104
2105 orig_request->callback = rbd_img_obj_copyup_callback;
2106 osdc = &rbd_dev->rbd_client->client->osdc;
2107 result = rbd_obj_request_submit(osdc, orig_request);
2108 if (!result)
2109 return;
2110out_err:
2111 /* Record the error code and complete the request */
2112
2113 orig_request->result = result;
2114 orig_request->xferred = 0;
2115 obj_request_done_set(orig_request);
2116 rbd_obj_request_complete(orig_request);
Alex Elder3d7efd12013-04-19 15:34:50 -05002117}
2118
2119/*
2120 * Read from the parent image the range of data that covers the
2121 * entire target of the given object request. This is used for
2122 * satisfying a layered image write request when the target of an
2123 * object request from the image request does not exist.
2124 *
2125 * A page array big enough to hold the returned data is allocated
2126 * and supplied to rbd_img_request_fill() as the "data descriptor."
2127 * When the read completes, this page array will be transferred to
2128 * the original object request for the copyup operation.
2129 *
2130 * If an error occurs, record it as the result of the original
2131 * object request and mark it done so it gets completed.
2132 */
2133static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
2134{
2135 struct rbd_img_request *img_request = NULL;
2136 struct rbd_img_request *parent_request = NULL;
2137 struct rbd_device *rbd_dev;
2138 u64 img_offset;
2139 u64 length;
2140 struct page **pages = NULL;
2141 u32 page_count;
2142 int result;
2143
2144 rbd_assert(obj_request_img_data_test(obj_request));
2145 rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2146
2147 img_request = obj_request->img_request;
2148 rbd_assert(img_request != NULL);
2149 rbd_dev = img_request->rbd_dev;
2150 rbd_assert(rbd_dev->parent != NULL);
2151
2152 /*
Alex Elder0eefd472013-04-19 15:34:50 -05002153 * First things first. The original osd request is of no
 2154 * use to us any more; we'll need a new one that can hold
2155 * the two ops in a copyup request. We'll get that later,
2156 * but for now we can release the old one.
2157 */
2158 rbd_osd_req_destroy(obj_request->osd_req);
2159 obj_request->osd_req = NULL;
2160
2161 /*
Alex Elder3d7efd12013-04-19 15:34:50 -05002162 * Determine the byte range covered by the object in the
2163 * child image to which the original request was to be sent.
2164 */
2165 img_offset = obj_request->img_offset - obj_request->offset;
2166 length = (u64)1 << rbd_dev->header.obj_order;
2167
2168 /*
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002169 * There is no defined parent data beyond the parent
2170 * overlap, so limit what we read at that boundary if
2171 * necessary.
2172 */
2173 if (img_offset + length > rbd_dev->parent_overlap) {
2174 rbd_assert(img_offset < rbd_dev->parent_overlap);
2175 length = rbd_dev->parent_overlap - img_offset;
2176 }
2177
2178 /*
Alex Elder3d7efd12013-04-19 15:34:50 -05002179 * Allocate a page array big enough to receive the data read
2180 * from the parent.
2181 */
2182 page_count = (u32)calc_pages_for(0, length);
2183 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2184 if (IS_ERR(pages)) {
2185 result = PTR_ERR(pages);
2186 pages = NULL;
2187 goto out_err;
2188 }
2189
2190 result = -ENOMEM;
2191 parent_request = rbd_img_request_create(rbd_dev->parent,
2192 img_offset, length,
2193 false, true);
2194 if (!parent_request)
2195 goto out_err;
2196 rbd_obj_request_get(obj_request);
2197 parent_request->obj_request = obj_request;
2198
2199 result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
2200 if (result)
2201 goto out_err;
2202 parent_request->copyup_pages = pages;
2203
2204 parent_request->callback = rbd_img_obj_parent_read_full_callback;
2205 result = rbd_img_request_submit(parent_request);
2206 if (!result)
2207 return 0;
2208
2209 parent_request->copyup_pages = NULL;
2210 parent_request->obj_request = NULL;
2211 rbd_obj_request_put(obj_request);
2212out_err:
2213 if (pages)
2214 ceph_release_page_vector(pages, page_count);
2215 if (parent_request)
2216 rbd_img_request_put(parent_request);
2217 obj_request->result = result;
2218 obj_request->xferred = 0;
2219 obj_request_done_set(obj_request);
2220
2221 return result;
2222}
2223
Alex Elderc5b5ef62013-02-11 12:33:24 -06002224static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2225{
Alex Elderc5b5ef62013-02-11 12:33:24 -06002226 struct rbd_obj_request *orig_request;
2227 int result;
2228
2229 rbd_assert(!obj_request_img_data_test(obj_request));
2230
2231 /*
2232 * All we need from the object request is the original
2233 * request and the result of the STAT op. Grab those, then
2234 * we're done with the request.
2235 */
2236 orig_request = obj_request->obj_request;
2237 obj_request->obj_request = NULL;
2238 rbd_assert(orig_request);
2239 rbd_assert(orig_request->img_request);
2240
2241 result = obj_request->result;
2242 obj_request->result = 0;
2243
2244 dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2245 obj_request, orig_request, result,
2246 obj_request->xferred, obj_request->length);
2247 rbd_obj_request_put(obj_request);
2248
2249 rbd_assert(orig_request);
2250 rbd_assert(orig_request->img_request);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002251
2252 /*
2253 * Our only purpose here is to determine whether the object
2254 * exists, and we don't want to treat the non-existence as
2255 * an error. If something else comes back, transfer the
2256 * error to the original request and complete it now.
2257 */
2258 if (!result) {
2259 obj_request_existence_set(orig_request, true);
2260 } else if (result == -ENOENT) {
2261 obj_request_existence_set(orig_request, false);
2262 } else if (result) {
2263 orig_request->result = result;
Alex Elder3d7efd12013-04-19 15:34:50 -05002264 goto out;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002265 }
2266
2267 /*
2268 * Resubmit the original request now that we have recorded
2269 * whether the target object exists.
2270 */
Alex Elderb454e362013-04-19 15:34:50 -05002271 orig_request->result = rbd_img_obj_request_submit(orig_request);
Alex Elder3d7efd12013-04-19 15:34:50 -05002272out:
Alex Elderc5b5ef62013-02-11 12:33:24 -06002273 if (orig_request->result)
2274 rbd_obj_request_complete(orig_request);
2275 rbd_obj_request_put(orig_request);
2276}
2277
2278static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2279{
2280 struct rbd_obj_request *stat_request;
2281 struct rbd_device *rbd_dev;
2282 struct ceph_osd_client *osdc;
2283 struct page **pages = NULL;
2284 u32 page_count;
2285 size_t size;
2286 int ret;
2287
2288 /*
2289 * The response data for a STAT call consists of:
2290 * le64 length;
2291 * struct {
2292 * le32 tv_sec;
2293 * le32 tv_nsec;
2294 * } mtime;
2295 */
2296 size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2297 page_count = (u32)calc_pages_for(0, size);
2298 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2299 if (IS_ERR(pages))
2300 return PTR_ERR(pages);
2301
2302 ret = -ENOMEM;
2303 stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2304 OBJ_REQUEST_PAGES);
2305 if (!stat_request)
2306 goto out;
2307
2308 rbd_obj_request_get(obj_request);
2309 stat_request->obj_request = obj_request;
2310 stat_request->pages = pages;
2311 stat_request->page_count = page_count;
2312
2313 rbd_assert(obj_request->img_request);
2314 rbd_dev = obj_request->img_request->rbd_dev;
2315 stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2316 stat_request);
2317 if (!stat_request->osd_req)
2318 goto out;
2319 stat_request->callback = rbd_img_obj_exists_callback;
2320
2321 osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2322 osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2323 false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05002324 rbd_osd_req_format_read(stat_request);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002325
2326 osdc = &rbd_dev->rbd_client->client->osdc;
2327 ret = rbd_obj_request_submit(osdc, stat_request);
2328out:
2329 if (ret)
2330 rbd_obj_request_put(obj_request);
2331
2332 return ret;
2333}
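
/*
 * Hypothetical decoder for the STAT reply layout documented in
 * rbd_img_obj_exists_submit() above, assuming the reply pages have
 * been copied into a linear buffer. The struct and function names
 * are illustrative, not part of the driver.
 */
struct example_stat_reply {
	u64 length;
	u32 tv_sec;
	u32 tv_nsec;
};

static void example_decode_stat(const void *buf,
				struct example_stat_reply *reply)
{
	const u8 *p = buf;

	reply->length = le64_to_cpu(*(const __le64 *)p);
	reply->tv_sec = le32_to_cpu(*(const __le32 *)(p + 8));
	reply->tv_nsec = le32_to_cpu(*(const __le32 *)(p + 12));
}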
2334
Alex Elderb454e362013-04-19 15:34:50 -05002335static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2336{
2337 struct rbd_img_request *img_request;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002338 struct rbd_device *rbd_dev;
Alex Elder3d7efd12013-04-19 15:34:50 -05002339 bool known;
Alex Elderb454e362013-04-19 15:34:50 -05002340
2341 rbd_assert(obj_request_img_data_test(obj_request));
2342
2343 img_request = obj_request->img_request;
2344 rbd_assert(img_request);
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002345 rbd_dev = img_request->rbd_dev;
Alex Elderb454e362013-04-19 15:34:50 -05002346
Alex Elderb454e362013-04-19 15:34:50 -05002347 /*
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002348 * Only writes to layered images need special handling.
2349 * Reads and non-layered writes are simple object requests.
2350 * Layered writes that start beyond the end of the overlap
2351 * with the parent have no parent data, so they too are
2352 * simple object requests. Finally, if the target object is
2353 * known to already exist, its parent data has already been
2354 * copied, so a write to the object can also be handled as a
2355 * simple object request.
Alex Elderb454e362013-04-19 15:34:50 -05002356 */
2357 if (!img_request_write_test(img_request) ||
2358 !img_request_layered_test(img_request) ||
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002359 rbd_dev->parent_overlap <= obj_request->img_offset ||
Alex Elder3d7efd12013-04-19 15:34:50 -05002360 ((known = obj_request_known_test(obj_request)) &&
2361 obj_request_exists_test(obj_request))) {
Alex Elderb454e362013-04-19 15:34:50 -05002362
2363 struct rbd_device *rbd_dev;
2364 struct ceph_osd_client *osdc;
2365
2366 rbd_dev = obj_request->img_request->rbd_dev;
2367 osdc = &rbd_dev->rbd_client->client->osdc;
2368
2369 return rbd_obj_request_submit(osdc, obj_request);
2370 }
2371
2372 /*
Alex Elder3d7efd12013-04-19 15:34:50 -05002373 * It's a layered write. The target object might exist but
2374 * we may not know that yet. If we know it doesn't exist,
2375 * start by reading the data for the full target object from
2376 * the parent so we can use it for a copyup to the target.
Alex Elderb454e362013-04-19 15:34:50 -05002377 */
Alex Elder3d7efd12013-04-19 15:34:50 -05002378 if (known)
2379 return rbd_img_obj_parent_read_full(obj_request);
2380
2381 /* We don't know whether the target exists. Go find out. */
Alex Elderb454e362013-04-19 15:34:50 -05002382
2383 return rbd_img_obj_exists_submit(obj_request);
2384}
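
/*
 * Condensed restatement (illustrative only) of the dispatch rule in
 * rbd_img_obj_request_submit() above: a request goes straight to the
 * osd unless it is a layered write, within the parent overlap, whose
 * target object is not known to exist.
 */
static bool example_needs_copyup_path(bool write, bool layered,
				bool in_overlap, bool known, bool exists)
{
	return write && layered && in_overlap && !(known && exists);
}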
2385
Alex Elderbf0d5f502012-11-22 00:00:08 -06002386static int rbd_img_request_submit(struct rbd_img_request *img_request)
2387{
Alex Elderbf0d5f502012-11-22 00:00:08 -06002388 struct rbd_obj_request *obj_request;
Alex Elder46faeed2013-04-10 17:47:46 -05002389 struct rbd_obj_request *next_obj_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002390
Alex Elder37206ee2013-02-20 17:32:08 -06002391 dout("%s: img %p\n", __func__, img_request);
Alex Elder46faeed2013-04-10 17:47:46 -05002392 for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
Alex Elderbf0d5f502012-11-22 00:00:08 -06002393 int ret;
2394
Alex Elderb454e362013-04-19 15:34:50 -05002395 ret = rbd_img_obj_request_submit(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002396 if (ret)
2397 return ret;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002398 }
2399
2400 return 0;
2401}
2402
Alex Elder8b3e1a52013-01-24 16:13:36 -06002403static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
2404{
2405 struct rbd_obj_request *obj_request;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002406 struct rbd_device *rbd_dev;
2407 u64 obj_end;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002408
2409 rbd_assert(img_request_child_test(img_request));
2410
2411 obj_request = img_request->obj_request;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002412 rbd_assert(obj_request);
2413 rbd_assert(obj_request->img_request);
Alex Elder8b3e1a52013-01-24 16:13:36 -06002414
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002415 obj_request->result = img_request->result;
2416 if (obj_request->result)
2417 goto out;
2418
2419 /*
2420 * We need to zero anything beyond the parent overlap
2421 * boundary. Since rbd_img_obj_request_read_callback()
2422 * will zero anything beyond the end of a short read, an
2423 * easy way to do this is to pretend the data from the
2424 * parent came up short--ending at the overlap boundary.
2425 */
2426 rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2427 obj_end = obj_request->img_offset + obj_request->length;
2428 rbd_dev = obj_request->img_request->rbd_dev;
2429 if (obj_end > rbd_dev->parent_overlap) {
2430 u64 xferred = 0;
2431
2432 if (obj_request->img_offset < rbd_dev->parent_overlap)
2433 xferred = rbd_dev->parent_overlap -
2434 obj_request->img_offset;
2435
2436 obj_request->xferred = min(img_request->xferred, xferred);
2437 } else {
2438 obj_request->xferred = img_request->xferred;
2439 }
2440out:
Alex Elder8b3e1a52013-01-24 16:13:36 -06002441 rbd_img_obj_request_read_callback(obj_request);
2442 rbd_obj_request_complete(obj_request);
2443}
2444
2445static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
2446{
2447 struct rbd_device *rbd_dev;
2448 struct rbd_img_request *img_request;
2449 int result;
2450
2451 rbd_assert(obj_request_img_data_test(obj_request));
2452 rbd_assert(obj_request->img_request != NULL);
2453 rbd_assert(obj_request->result == (s32) -ENOENT);
2454 rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2455
2456 rbd_dev = obj_request->img_request->rbd_dev;
2457 rbd_assert(rbd_dev->parent != NULL);
2458 /* rbd_read_finish(obj_request, obj_request->length); */
2459 img_request = rbd_img_request_create(rbd_dev->parent,
2460 obj_request->img_offset,
2461 obj_request->length,
2462 false, true);
2463 result = -ENOMEM;
2464 if (!img_request)
2465 goto out_err;
2466
2467 rbd_obj_request_get(obj_request);
2468 img_request->obj_request = obj_request;
2469
Alex Elderf1a47392013-04-19 15:34:50 -05002470 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2471 obj_request->bio_list);
Alex Elder8b3e1a52013-01-24 16:13:36 -06002472 if (result)
2473 goto out_err;
2474
2475 img_request->callback = rbd_img_parent_read_callback;
2476 result = rbd_img_request_submit(img_request);
2477 if (result)
2478 goto out_err;
2479
2480 return;
2481out_err:
2482 if (img_request)
2483 rbd_img_request_put(img_request);
2484 obj_request->result = result;
2485 obj_request->xferred = 0;
2486 obj_request_done_set(obj_request);
2487}
2488
Alex Eldercf81b602013-01-17 12:18:46 -06002489static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
Alex Elderb8d70032012-11-30 17:53:04 -06002490 u64 ver, u64 notify_id)
2491{
2492 struct rbd_obj_request *obj_request;
Alex Elder21692382013-04-05 01:27:12 -05002493 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elderb8d70032012-11-30 17:53:04 -06002494 int ret;
2495
2496 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2497 OBJ_REQUEST_NODATA);
2498 if (!obj_request)
2499 return -ENOMEM;
2500
2501 ret = -ENOMEM;
Alex Elder430c28c2013-04-03 21:32:51 -05002502 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
Alex Elderb8d70032012-11-30 17:53:04 -06002503 if (!obj_request->osd_req)
2504 goto out;
Alex Elder21692382013-04-05 01:27:12 -05002505 obj_request->callback = rbd_obj_request_put;
Alex Elderb8d70032012-11-30 17:53:04 -06002506
Alex Elderc99d2d42013-04-05 01:27:11 -05002507 osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
2508 notify_id, ver, 0);
Alex Elder9d4df012013-04-19 15:34:50 -05002509 rbd_osd_req_format_read(obj_request);
Alex Elder430c28c2013-04-03 21:32:51 -05002510
Alex Elderb8d70032012-11-30 17:53:04 -06002511 ret = rbd_obj_request_submit(osdc, obj_request);
Alex Elderb8d70032012-11-30 17:53:04 -06002512out:
Alex Eldercf81b602013-01-17 12:18:46 -06002513 if (ret)
2514 rbd_obj_request_put(obj_request);
Alex Elderb8d70032012-11-30 17:53:04 -06002515
2516 return ret;
2517}
2518
2519static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
2520{
2521 struct rbd_device *rbd_dev = (struct rbd_device *)data;
2522 u64 hver;
2523 int rc;
2524
2525 if (!rbd_dev)
2526 return;
2527
Alex Elder37206ee2013-02-20 17:32:08 -06002528 dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
Alex Elderb8d70032012-11-30 17:53:04 -06002529 rbd_dev->header_name, (unsigned long long) notify_id,
2530 (unsigned int) opcode);
2531 rc = rbd_dev_refresh(rbd_dev, &hver);
2532 if (rc)
2533 rbd_warn(rbd_dev, "got notification but failed to "
2534 " update snaps: %d\n", rc);
2535
Alex Eldercf81b602013-01-17 12:18:46 -06002536 rbd_obj_notify_ack(rbd_dev, hver, notify_id);
Alex Elderb8d70032012-11-30 17:53:04 -06002537}
2538
Alex Elder9969ebc2013-01-18 12:31:10 -06002539/*
2540 * Request sync osd watch/unwatch. The value of "start" determines
2541 * whether a watch request is being initiated or torn down.
2542 */
2543static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
2544{
2545 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2546 struct rbd_obj_request *obj_request;
Alex Elder9969ebc2013-01-18 12:31:10 -06002547 int ret;
2548
2549 rbd_assert(start ^ !!rbd_dev->watch_event);
2550 rbd_assert(start ^ !!rbd_dev->watch_request);
2551
2552 if (start) {
Alex Elder3c663bb2013-02-15 11:42:30 -06002553 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
Alex Elder9969ebc2013-01-18 12:31:10 -06002554 &rbd_dev->watch_event);
2555 if (ret < 0)
2556 return ret;
Alex Elder8eb87562013-01-25 17:08:55 -06002557 rbd_assert(rbd_dev->watch_event != NULL);
Alex Elder9969ebc2013-01-18 12:31:10 -06002558 }
2559
2560 ret = -ENOMEM;
2561 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2562 OBJ_REQUEST_NODATA);
2563 if (!obj_request)
2564 goto out_cancel;
2565
Alex Elder430c28c2013-04-03 21:32:51 -05002566 obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
2567 if (!obj_request->osd_req)
2568 goto out_cancel;
2569
Alex Elder8eb87562013-01-25 17:08:55 -06002570 if (start)
Alex Elder975241a2013-01-25 17:08:55 -06002571 ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
Alex Elder8eb87562013-01-25 17:08:55 -06002572 else
Alex Elder6977c3f2013-01-25 17:08:55 -06002573 ceph_osdc_unregister_linger_request(osdc,
Alex Elder975241a2013-01-25 17:08:55 -06002574 rbd_dev->watch_request->osd_req);
Alex Elder21692382013-04-05 01:27:12 -05002575
2576 osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
2577 rbd_dev->watch_event->cookie,
2578 rbd_dev->header.obj_version, start);
Alex Elder9d4df012013-04-19 15:34:50 -05002579 rbd_osd_req_format_write(obj_request);
Alex Elder21692382013-04-05 01:27:12 -05002580
Alex Elder9969ebc2013-01-18 12:31:10 -06002581 ret = rbd_obj_request_submit(osdc, obj_request);
2582 if (ret)
2583 goto out_cancel;
2584 ret = rbd_obj_request_wait(obj_request);
2585 if (ret)
2586 goto out_cancel;
Alex Elder9969ebc2013-01-18 12:31:10 -06002587 ret = obj_request->result;
2588 if (ret)
2589 goto out_cancel;
2590
Alex Elder8eb87562013-01-25 17:08:55 -06002591 /*
2592 * A watch request is set to linger, so the underlying osd
2593 * request won't go away until we unregister it. We retain
2594 * a pointer to the object request during that time (in
2595 * rbd_dev->watch_request), so we'll keep a reference to
2596 * it. We'll drop that reference (below) after we've
2597 * unregistered it.
2598 */
2599 if (start) {
2600 rbd_dev->watch_request = obj_request;
2601
2602 return 0;
2603 }
2604
2605 /* We have successfully torn down the watch request */
2606
2607 rbd_obj_request_put(rbd_dev->watch_request);
2608 rbd_dev->watch_request = NULL;
Alex Elder9969ebc2013-01-18 12:31:10 -06002609out_cancel:
2610 /* Cancel the event if we're tearing down, or on error */
2611 ceph_osdc_cancel_event(rbd_dev->watch_event);
2612 rbd_dev->watch_event = NULL;
Alex Elder9969ebc2013-01-18 12:31:10 -06002613 if (obj_request)
2614 rbd_obj_request_put(obj_request);
2615
2616 return ret;
2617}
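
/*
 * Usage sketch for rbd_dev_header_watch_sync() above; the function
 * and the 1/0 "start" convention are from the driver, but the
 * wrapper below is hypothetical. A nonzero "start" registers a
 * lingering watch on the header object; zero tears it down again.
 */
static int example_watch_lifecycle(struct rbd_device *rbd_dev)
{
	int ret;

	ret = rbd_dev_header_watch_sync(rbd_dev, 1);	/* register */
	if (ret)
		return ret;
	/* ... rbd_watch_cb() fires on header object updates ... */
	return rbd_dev_header_watch_sync(rbd_dev, 0);	/* tear down */
}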
2618
Alex Elder36be9a72013-01-19 00:30:28 -06002619/*
2620 * Synchronous osd object method call
2621 */
2622static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
2623 const char *object_name,
2624 const char *class_name,
2625 const char *method_name,
2626 const char *outbound,
2627 size_t outbound_size,
2628 char *inbound,
2629 size_t inbound_size,
2630 u64 *version)
2631{
Alex Elder21692382013-04-05 01:27:12 -05002632 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder36be9a72013-01-19 00:30:28 -06002633 struct rbd_obj_request *obj_request;
Alex Elder36be9a72013-01-19 00:30:28 -06002634 struct page **pages;
2635 u32 page_count;
2636 int ret;
2637
2638 /*
Alex Elder6010a452013-04-05 01:27:11 -05002639 * Method calls are ultimately read operations. The result
 2640 * should be placed into the inbound buffer provided.  They
2641 * also supply outbound data--parameters for the object
2642 * method. Currently if this is present it will be a
2643 * snapshot id.
Alex Elder36be9a72013-01-19 00:30:28 -06002644 */
2645 page_count = (u32) calc_pages_for(0, inbound_size);
2646 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2647 if (IS_ERR(pages))
2648 return PTR_ERR(pages);
2649
2650 ret = -ENOMEM;
Alex Elder6010a452013-04-05 01:27:11 -05002651 obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
Alex Elder36be9a72013-01-19 00:30:28 -06002652 OBJ_REQUEST_PAGES);
2653 if (!obj_request)
2654 goto out;
2655
2656 obj_request->pages = pages;
2657 obj_request->page_count = page_count;
2658
Alex Elder430c28c2013-04-03 21:32:51 -05002659 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
Alex Elder36be9a72013-01-19 00:30:28 -06002660 if (!obj_request->osd_req)
2661 goto out;
2662
Alex Elderc99d2d42013-04-05 01:27:11 -05002663 osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
Alex Elder04017e22013-04-05 14:46:02 -05002664 class_name, method_name);
2665 if (outbound_size) {
2666 struct ceph_pagelist *pagelist;
2667
2668 pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
2669 if (!pagelist)
2670 goto out;
2671
2672 ceph_pagelist_init(pagelist);
2673 ceph_pagelist_append(pagelist, outbound, outbound_size);
2674 osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
2675 pagelist);
2676 }
Alex Eldera4ce40a2013-04-05 01:27:12 -05002677 osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
2678 obj_request->pages, inbound_size,
Alex Elder44cd1882013-04-05 01:27:12 -05002679 0, false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05002680 rbd_osd_req_format_read(obj_request);
Alex Elder430c28c2013-04-03 21:32:51 -05002681
Alex Elder36be9a72013-01-19 00:30:28 -06002682 ret = rbd_obj_request_submit(osdc, obj_request);
2683 if (ret)
2684 goto out;
2685 ret = rbd_obj_request_wait(obj_request);
2686 if (ret)
2687 goto out;
2688
2689 ret = obj_request->result;
2690 if (ret < 0)
2691 goto out;
Alex Elder23ed6e12013-02-06 13:11:38 -06002692 ret = 0;
Alex Elder903bb322013-02-06 13:11:38 -06002693 ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
Alex Elder36be9a72013-01-19 00:30:28 -06002694 if (version)
2695 *version = obj_request->version;
2696out:
2697 if (obj_request)
2698 rbd_obj_request_put(obj_request);
2699 else
2700 ceph_release_page_vector(pages, page_count);
2701
2702 return ret;
2703}
2704
Alex Elderbf0d5f502012-11-22 00:00:08 -06002705static void rbd_request_fn(struct request_queue *q)
Alex Eldercc344fa2013-02-19 12:25:56 -06002706 __releases(q->queue_lock) __acquires(q->queue_lock)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002707{
2708 struct rbd_device *rbd_dev = q->queuedata;
2709 bool read_only = rbd_dev->mapping.read_only;
2710 struct request *rq;
2711 int result;
2712
2713 while ((rq = blk_fetch_request(q))) {
2714 bool write_request = rq_data_dir(rq) == WRITE;
2715 struct rbd_img_request *img_request;
2716 u64 offset;
2717 u64 length;
2718
2719 /* Ignore any non-FS requests that filter through. */
2720
2721 if (rq->cmd_type != REQ_TYPE_FS) {
Alex Elder4dda41d2013-02-20 21:59:33 -06002722 dout("%s: non-fs request type %d\n", __func__,
2723 (int) rq->cmd_type);
2724 __blk_end_request_all(rq, 0);
2725 continue;
2726 }
2727
2728 /* Ignore/skip any zero-length requests */
2729
2730 offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
2731 length = (u64) blk_rq_bytes(rq);
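		/*
		 * Example: a request starting at sector 2048 for eight
		 * 512-byte sectors becomes offset 2048 << 9 = 1048576
		 * bytes (1 MiB) and length 4096 bytes.
		 */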
2732
2733 if (!length) {
2734 dout("%s: zero-length request\n", __func__);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002735 __blk_end_request_all(rq, 0);
2736 continue;
2737 }
2738
2739 spin_unlock_irq(q->queue_lock);
2740
2741 /* Disallow writes to a read-only device */
2742
2743 if (write_request) {
2744 result = -EROFS;
2745 if (read_only)
2746 goto end_request;
2747 rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
2748 }
2749
Alex Elder6d292902013-01-14 12:43:31 -06002750 /*
2751 * Quit early if the mapped snapshot no longer
2752 * exists. It's still possible the snapshot will
2753 * have disappeared by the time our request arrives
2754 * at the osd, but there's no sense in sending it if
2755 * we already know.
2756 */
2757 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
Alex Elderbf0d5f502012-11-22 00:00:08 -06002758 dout("request for non-existent snapshot");
2759 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
2760 result = -ENXIO;
2761 goto end_request;
2762 }
2763
Alex Elderbf0d5f502012-11-22 00:00:08 -06002764 result = -EINVAL;
2765 if (WARN_ON(offset && length > U64_MAX - offset + 1))
2766 goto end_request; /* Shouldn't happen */
2767
2768 result = -ENOMEM;
2769 img_request = rbd_img_request_create(rbd_dev, offset, length,
Alex Elder9849e982013-01-24 16:13:36 -06002770 write_request, false);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002771 if (!img_request)
2772 goto end_request;
2773
2774 img_request->rq = rq;
2775
Alex Elderf1a47392013-04-19 15:34:50 -05002776 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2777 rq->bio);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002778 if (!result)
2779 result = rbd_img_request_submit(img_request);
2780 if (result)
2781 rbd_img_request_put(img_request);
2782end_request:
2783 spin_lock_irq(q->queue_lock);
2784 if (result < 0) {
Alex Elder7da22d22013-01-24 16:13:36 -06002785 rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
2786 write_request ? "write" : "read",
2787 length, offset, result);
2788
Alex Elderbf0d5f502012-11-22 00:00:08 -06002789 __blk_end_request_all(rq, result);
2790 }
2791 }
2792}
2793
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002794/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002795 * A queue callback. Makes sure that we don't create a bio that spans across
 2796 * multiple osd objects. One exception would be with single-page bios,
Alex Elderf7760da2012-10-20 22:17:27 -05002797 * which we handle later in bio_chain_clone_range()
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002798 */
2799static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
2800 struct bio_vec *bvec)
2801{
2802 struct rbd_device *rbd_dev = q->queuedata;
Alex Eldere5cfeed22012-10-20 22:17:27 -05002803 sector_t sector_offset;
2804 sector_t sectors_per_obj;
2805 sector_t obj_sector_offset;
2806 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002807
Alex Eldere5cfeed22012-10-20 22:17:27 -05002808 /*
 2809 * Find how far into its rbd object the bio's starting sector
 2810 * falls, after converting the partition-relative sector to be
 2811 * relative to the enclosing device.
2812 */
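	/*
	 * Worked example (assuming the common 4 MiB objects, i.e. an
	 * object order of 22): sectors_per_obj = 1 << (22 - 9) = 8192.
	 * A bio starting at device sector 12000 begins 12000 & 8191 =
	 * 3808 sectors into its object, leaving (8192 - 3808) << 9 =
	 * 2244608 bytes before the object boundary.
	 */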
2813 sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
2814 sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
2815 obj_sector_offset = sector_offset & (sectors_per_obj - 1);
Alex Elder593a9e72012-02-07 12:03:37 -06002816
Alex Eldere5cfeed22012-10-20 22:17:27 -05002817 /*
2818 * Compute the number of bytes from that offset to the end
2819 * of the object. Account for what's already used by the bio.
2820 */
2821 ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
2822 if (ret > bmd->bi_size)
2823 ret -= bmd->bi_size;
2824 else
2825 ret = 0;
2826
2827 /*
2828 * Don't send back more than was asked for. And if the bio
2829 * was empty, let the whole thing through because: "Note
2830 * that a block device *must* allow a single page to be
2831 * added to an empty bio."
2832 */
2833 rbd_assert(bvec->bv_len <= PAGE_SIZE);
2834 if (ret > (int) bvec->bv_len || !bmd->bi_size)
2835 ret = (int) bvec->bv_len;
2836
2837 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002838}
2839
2840static void rbd_free_disk(struct rbd_device *rbd_dev)
2841{
2842 struct gendisk *disk = rbd_dev->disk;
2843
2844 if (!disk)
2845 return;
2846
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002847 if (disk->flags & GENHD_FL_UP)
2848 del_gendisk(disk);
2849 if (disk->queue)
2850 blk_cleanup_queue(disk->queue);
2851 put_disk(disk);
2852}
2853
Alex Elder788e2df2013-01-17 12:25:27 -06002854static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2855 const char *object_name,
2856 u64 offset, u64 length,
2857 char *buf, u64 *version)
2859{
Alex Elder21692382013-04-05 01:27:12 -05002860 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder788e2df2013-01-17 12:25:27 -06002861 struct rbd_obj_request *obj_request;
Alex Elder788e2df2013-01-17 12:25:27 -06002862 struct page **pages = NULL;
2863 u32 page_count;
Alex Elder1ceae7e2013-02-06 13:11:38 -06002864 size_t size;
Alex Elder788e2df2013-01-17 12:25:27 -06002865 int ret;
2866
2867 page_count = (u32) calc_pages_for(offset, length);
2868 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2869 if (IS_ERR(pages))
2870 ret = PTR_ERR(pages);
2871
2872 ret = -ENOMEM;
2873 obj_request = rbd_obj_request_create(object_name, offset, length,
Alex Elder36be9a72013-01-19 00:30:28 -06002874 OBJ_REQUEST_PAGES);
Alex Elder788e2df2013-01-17 12:25:27 -06002875 if (!obj_request)
2876 goto out;
2877
2878 obj_request->pages = pages;
2879 obj_request->page_count = page_count;
2880
Alex Elder430c28c2013-04-03 21:32:51 -05002881 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
Alex Elder788e2df2013-01-17 12:25:27 -06002882 if (!obj_request->osd_req)
2883 goto out;
2884
Alex Elderc99d2d42013-04-05 01:27:11 -05002885 osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
2886 offset, length, 0, 0);
Alex Elder406e2c92013-04-15 14:50:36 -05002887 osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
Alex Eldera4ce40a2013-04-05 01:27:12 -05002888 obj_request->pages,
Alex Elder44cd1882013-04-05 01:27:12 -05002889 obj_request->length,
2890 obj_request->offset & ~PAGE_MASK,
2891 false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05002892 rbd_osd_req_format_read(obj_request);
Alex Elder430c28c2013-04-03 21:32:51 -05002893
Alex Elder788e2df2013-01-17 12:25:27 -06002894 ret = rbd_obj_request_submit(osdc, obj_request);
2895 if (ret)
2896 goto out;
2897 ret = rbd_obj_request_wait(obj_request);
2898 if (ret)
2899 goto out;
2900
2901 ret = obj_request->result;
2902 if (ret < 0)
2903 goto out;
Alex Elder1ceae7e2013-02-06 13:11:38 -06002904
2905 rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
2906 size = (size_t) obj_request->xferred;
Alex Elder903bb322013-02-06 13:11:38 -06002907 ceph_copy_from_page_vector(pages, buf, 0, size);
Alex Elder23ed6e12013-02-06 13:11:38 -06002908 rbd_assert(size <= (size_t) INT_MAX);
2909 ret = (int) size;
Alex Elder788e2df2013-01-17 12:25:27 -06002910 if (version)
2911 *version = obj_request->version;
2912out:
2913 if (obj_request)
2914 rbd_obj_request_put(obj_request);
2915 else
2916 ceph_release_page_vector(pages, page_count);
2917
2918 return ret;
2919}
2920
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002921/*
Alex Elder4156d992012-08-02 11:29:46 -05002922 * Read the complete header for the given rbd device.
2923 *
2924 * Returns a pointer to a dynamically-allocated buffer containing
2925 * the complete and validated header. Caller can pass the address
2926 * of a variable that will be filled in with the version of the
2927 * header object at the time it was read.
2928 *
2929 * Returns a pointer-coded errno if a failure occurs.
2930 */
2931static struct rbd_image_header_ondisk *
2932rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
2933{
2934 struct rbd_image_header_ondisk *ondisk = NULL;
2935 u32 snap_count = 0;
2936 u64 names_size = 0;
2937 u32 want_count;
2938 int ret;
2939
2940 /*
2941 * The complete header will include an array of its 64-bit
2942 * snapshot ids, followed by the names of those snapshots as
2943 * a contiguous block of NUL-terminated strings. Note that
2944 * the number of snapshots could change by the time we read
2945 * it in, in which case we re-read it.
2946 */
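	/*
	 * For example, an image with two snapshots named "a" and
	 * "backup" would have snap_count == 2 and (assuming the length
	 * covers the terminating NULs) names_size == 2 + 7 = 9 bytes.
	 */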
2947 do {
2948 size_t size;
2949
2950 kfree(ondisk);
2951
2952 size = sizeof (*ondisk);
2953 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
2954 size += names_size;
2955 ondisk = kmalloc(size, GFP_KERNEL);
2956 if (!ondisk)
2957 return ERR_PTR(-ENOMEM);
2958
Alex Elder788e2df2013-01-17 12:25:27 -06002959 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
Alex Elder4156d992012-08-02 11:29:46 -05002960 0, size,
2961 (char *) ondisk, version);
Alex Elder4156d992012-08-02 11:29:46 -05002962 if (ret < 0)
2963 goto out_err;
2964 if (WARN_ON((size_t) ret < size)) {
2965 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05002966 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
2967 size, ret);
Alex Elder4156d992012-08-02 11:29:46 -05002968 goto out_err;
2969 }
2970 if (!rbd_dev_ondisk_valid(ondisk)) {
2971 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05002972 rbd_warn(rbd_dev, "invalid header");
Alex Elder4156d992012-08-02 11:29:46 -05002973 goto out_err;
2974 }
2975
2976 names_size = le64_to_cpu(ondisk->snap_names_len);
2977 want_count = snap_count;
2978 snap_count = le32_to_cpu(ondisk->snap_count);
2979 } while (snap_count != want_count);
2980
2981 return ondisk;
2982
2983out_err:
2984 kfree(ondisk);
2985
2986 return ERR_PTR(ret);
2987}
2988
2989/*
 2990 * re-read the on-disk header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002991 */
2992static int rbd_read_header(struct rbd_device *rbd_dev,
2993 struct rbd_image_header *header)
2994{
Alex Elder4156d992012-08-02 11:29:46 -05002995 struct rbd_image_header_ondisk *ondisk;
2996 u64 ver = 0;
2997 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002998
Alex Elder4156d992012-08-02 11:29:46 -05002999 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
3000 if (IS_ERR(ondisk))
3001 return PTR_ERR(ondisk);
3002 ret = rbd_header_from_disk(header, ondisk);
3003 if (ret >= 0)
3004 header->obj_version = ver;
3005 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003006
Alex Elder4156d992012-08-02 11:29:46 -05003007 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003008}
3009
Alex Elder41f38c22012-10-25 23:34:40 -05003010static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003011{
3012 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05003013 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003014
Alex Eldera0593292012-07-19 09:09:27 -05003015 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder41f38c22012-10-25 23:34:40 -05003016 rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003017}
3018
Alex Elder94785542012-10-09 13:50:17 -07003019static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
3020{
3021 sector_t size;
3022
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003023 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Alex Elder94785542012-10-09 13:50:17 -07003024 return;
3025
3026 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
3027 dout("setting size to %llu sectors", (unsigned long long) size);
3028 rbd_dev->mapping.size = (u64) size;
3029 set_capacity(rbd_dev->disk, size);
3030}
3031
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003032/*
 3033 * Only read the first part of the on-disk header, without the snaps info.
3034 */
Alex Elder117973f2012-08-31 17:29:55 -05003035static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003036{
3037 int ret;
3038 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003039
3040 ret = rbd_read_header(rbd_dev, &h);
3041 if (ret < 0)
3042 return ret;
3043
Josh Durgina51aa0c2011-12-05 10:35:04 -08003044 down_write(&rbd_dev->header_rwsem);
3045
Alex Elder94785542012-10-09 13:50:17 -07003046 /* Update image size, and check for resize of mapped image */
3047 rbd_dev->header.image_size = h.image_size;
3048 rbd_update_mapping_size(rbd_dev);
Sage Weil9db4b3e2011-04-19 22:49:06 -07003049
Alex Elder849b4262012-07-09 21:04:24 -05003050 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003051 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05003052 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08003053 /* osd requests may still refer to snapc */
3054 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003055
Alex Elderb8136232012-07-25 09:32:41 -05003056 if (hver)
3057 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08003058 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08003059 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003060 rbd_dev->header.snapc = h.snapc;
3061 rbd_dev->header.snap_names = h.snap_names;
3062 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05003063 /* Free the extra copy of the object prefix */
3064 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
3065 kfree(h.object_prefix);
3066
Alex Elder304f6802012-08-31 17:29:52 -05003067 ret = rbd_dev_snaps_update(rbd_dev);
3068 if (!ret)
3069 ret = rbd_dev_snaps_register(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003070
Josh Durginc6666012011-11-21 17:11:12 -08003071 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003072
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003073 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003074}
3075
Alex Elder117973f2012-08-31 17:29:55 -05003076static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
Alex Elder1fe5e992012-07-25 09:32:41 -05003077{
3078 int ret;
3079
Alex Elder117973f2012-08-31 17:29:55 -05003080 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder1fe5e992012-07-25 09:32:41 -05003081 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05003082 if (rbd_dev->image_format == 1)
3083 ret = rbd_dev_v1_refresh(rbd_dev, hver);
3084 else
3085 ret = rbd_dev_v2_refresh(rbd_dev, hver);
Alex Elder1fe5e992012-07-25 09:32:41 -05003086 mutex_unlock(&ctl_mutex);
Laurent Barbed98df632013-04-10 17:47:46 -05003087 revalidate_disk(rbd_dev->disk);
Alex Elder1fe5e992012-07-25 09:32:41 -05003088
3089 return ret;
3090}
3091
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003092static int rbd_init_disk(struct rbd_device *rbd_dev)
3093{
3094 struct gendisk *disk;
3095 struct request_queue *q;
Alex Elder593a9e72012-02-07 12:03:37 -06003096 u64 segment_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003097
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003098 /* create gendisk info */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003099 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
3100 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05003101 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003102
Alex Elderf0f8cef2012-01-29 13:57:44 -06003103 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05003104 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003105 disk->major = rbd_dev->major;
3106 disk->first_minor = 0;
3107 disk->fops = &rbd_bd_ops;
3108 disk->private_data = rbd_dev;
3109
Alex Elderbf0d5f502012-11-22 00:00:08 -06003110 q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003111 if (!q)
3112 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07003113
Alex Elder593a9e72012-02-07 12:03:37 -06003114 /* We use the default size, but let's be explicit about it. */
3115 blk_queue_physical_block_size(q, SECTOR_SIZE);
3116
Josh Durgin029bcbd2011-07-22 11:35:23 -07003117 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06003118 segment_size = rbd_obj_bytes(&rbd_dev->header);
3119 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
3120 blk_queue_max_segment_size(q, segment_size);
3121 blk_queue_io_min(q, segment_size);
3122 blk_queue_io_opt(q, segment_size);
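	/*
	 * With the common 4 MiB object size these limits work out to
	 * 4194304 / 512 = 8192 sectors per request, with minimum and
	 * optimal I/O sizes aligned to one object.
	 */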
Josh Durgin029bcbd2011-07-22 11:35:23 -07003123
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003124 blk_queue_merge_bvec(q, rbd_merge_bvec);
3125 disk->queue = q;
3126
3127 q->queuedata = rbd_dev;
3128
3129 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003130
Alex Elder12f02942012-08-29 17:11:07 -05003131 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
3132
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003133 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003134out_disk:
3135 put_disk(disk);
Alex Elder1fcdb8a2012-08-29 17:11:06 -05003136
3137 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003138}
3139
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003140/*
3141 sysfs
3142*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003143
Alex Elder593a9e72012-02-07 12:03:37 -06003144static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3145{
3146 return container_of(dev, struct rbd_device, dev);
3147}
3148
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003149static ssize_t rbd_size_show(struct device *dev,
3150 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003151{
Alex Elder593a9e72012-02-07 12:03:37 -06003152 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08003153 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003154
Josh Durgina51aa0c2011-12-05 10:35:04 -08003155 down_read(&rbd_dev->header_rwsem);
3156 size = get_capacity(rbd_dev->disk);
3157 up_read(&rbd_dev->header_rwsem);
3158
3159 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003160}
3161
Alex Elder34b13182012-07-13 20:35:12 -05003162/*
3163 * Note this shows the features for whatever's mapped, which is not
3164 * necessarily the base image.
3165 */
3166static ssize_t rbd_features_show(struct device *dev,
3167 struct device_attribute *attr, char *buf)
3168{
3169 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3170
3171 return sprintf(buf, "0x%016llx\n",
3172 (unsigned long long) rbd_dev->mapping.features);
3173}
3174
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003175static ssize_t rbd_major_show(struct device *dev,
3176 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003177{
Alex Elder593a9e72012-02-07 12:03:37 -06003178 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003179
3180 return sprintf(buf, "%d\n", rbd_dev->major);
3181}
3182
3183static ssize_t rbd_client_id_show(struct device *dev,
3184 struct device_attribute *attr, char *buf)
3185{
Alex Elder593a9e72012-02-07 12:03:37 -06003186 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003187
Alex Elder1dbb4392012-01-24 10:08:37 -06003188 return sprintf(buf, "client%lld\n",
3189 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003190}
3191
3192static ssize_t rbd_pool_show(struct device *dev,
3193 struct device_attribute *attr, char *buf)
3194{
Alex Elder593a9e72012-02-07 12:03:37 -06003195 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003196
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003197 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003198}
3199
Alex Elder9bb2f332012-07-12 10:46:35 -05003200static ssize_t rbd_pool_id_show(struct device *dev,
3201 struct device_attribute *attr, char *buf)
3202{
3203 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3204
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003205 return sprintf(buf, "%llu\n",
3206 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05003207}
3208
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003209static ssize_t rbd_name_show(struct device *dev,
3210 struct device_attribute *attr, char *buf)
3211{
Alex Elder593a9e72012-02-07 12:03:37 -06003212 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003213
Alex Eldera92ffdf2012-10-30 19:40:33 -05003214 if (rbd_dev->spec->image_name)
3215 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3216
3217 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003218}
3219
Alex Elder589d30e2012-07-10 20:30:11 -05003220static ssize_t rbd_image_id_show(struct device *dev,
3221 struct device_attribute *attr, char *buf)
3222{
3223 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3224
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003225 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05003226}
3227
Alex Elder34b13182012-07-13 20:35:12 -05003228/*
3229 * Shows the name of the currently-mapped snapshot (or
3230 * RBD_SNAP_HEAD_NAME for the base image).
3231 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003232static ssize_t rbd_snap_show(struct device *dev,
3233 struct device_attribute *attr,
3234 char *buf)
3235{
Alex Elder593a9e72012-02-07 12:03:37 -06003236 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003237
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003238 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003239}
3240
Alex Elder86b00e02012-10-25 23:34:42 -05003241/*
3242 * For an rbd v2 image, shows the pool id, image id, and snapshot id
3243 * for the parent image. If there is no parent, simply shows
3244 * "(no parent image)".
3245 */
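/*
 * Illustrative (hypothetical values) output for a mapped clone:
 *
 *	pool_id 2
 *	pool_name rbd
 *	image_id 1028f2eb141f
 *	image_name parent-image
 *	snap_id 4
 *	snap_name parent-snap
 *	overlap 1073741824
 */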
3246static ssize_t rbd_parent_show(struct device *dev,
3247 struct device_attribute *attr,
3248 char *buf)
3249{
3250 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3251 struct rbd_spec *spec = rbd_dev->parent_spec;
3252 int count;
3253 char *bufp = buf;
3254
3255 if (!spec)
3256 return sprintf(buf, "(no parent image)\n");
3257
3258 count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
3259 (unsigned long long) spec->pool_id, spec->pool_name);
3260 if (count < 0)
3261 return count;
3262 bufp += count;
3263
3264 count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
3265 spec->image_name ? spec->image_name : "(unknown)");
3266 if (count < 0)
3267 return count;
3268 bufp += count;
3269
3270 count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
3271 (unsigned long long) spec->snap_id, spec->snap_name);
3272 if (count < 0)
3273 return count;
3274 bufp += count;
3275
3276 count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
3277 if (count < 0)
3278 return count;
3279 bufp += count;
3280
3281 return (ssize_t) (bufp - buf);
3282}
3283
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003284static ssize_t rbd_image_refresh(struct device *dev,
3285 struct device_attribute *attr,
3286 const char *buf,
3287 size_t size)
3288{
Alex Elder593a9e72012-02-07 12:03:37 -06003289 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05003290 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003291
Alex Elder117973f2012-08-31 17:29:55 -05003292 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05003293
3294 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003295}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003296
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003297static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
Alex Elder34b13182012-07-13 20:35:12 -05003298static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003299static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
3300static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
3301static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
Alex Elder9bb2f332012-07-12 10:46:35 -05003302static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003303static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
Alex Elder589d30e2012-07-10 20:30:11 -05003304static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003305static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
3306static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
Alex Elder86b00e02012-10-25 23:34:42 -05003307static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
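/*
 * Each mapped device exposes these attributes through sysfs under
 * /sys/bus/rbd/devices/<id>/; reading "size", for example, reports
 * the mapped image size in bytes.
 */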
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003308
3309static struct attribute *rbd_attrs[] = {
3310 &dev_attr_size.attr,
Alex Elder34b13182012-07-13 20:35:12 -05003311 &dev_attr_features.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003312 &dev_attr_major.attr,
3313 &dev_attr_client_id.attr,
3314 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05003315 &dev_attr_pool_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003316 &dev_attr_name.attr,
Alex Elder589d30e2012-07-10 20:30:11 -05003317 &dev_attr_image_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003318 &dev_attr_current_snap.attr,
Alex Elder86b00e02012-10-25 23:34:42 -05003319 &dev_attr_parent.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003320 &dev_attr_refresh.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003321 NULL
3322};
3323
3324static struct attribute_group rbd_attr_group = {
3325 .attrs = rbd_attrs,
3326};
3327
3328static const struct attribute_group *rbd_attr_groups[] = {
3329 &rbd_attr_group,
3330 NULL
3331};
3332
3333static void rbd_sysfs_dev_release(struct device *dev)
3334{
3335}
3336
3337static struct device_type rbd_device_type = {
3338 .name = "rbd",
3339 .groups = rbd_attr_groups,
3340 .release = rbd_sysfs_dev_release,
3341};
3342
3343
3344/*
3345 sysfs - snapshots
3346*/
3347
3348static ssize_t rbd_snap_size_show(struct device *dev,
3349 struct device_attribute *attr,
3350 char *buf)
3351{
3352 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
3353
Josh Durgin35915382011-12-05 18:25:13 -08003354 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003355}
3356
3357static ssize_t rbd_snap_id_show(struct device *dev,
3358 struct device_attribute *attr,
3359 char *buf)
3360{
3361 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
3362
Josh Durgin35915382011-12-05 18:25:13 -08003363 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003364}
3365
Alex Elder34b13182012-07-13 20:35:12 -05003366static ssize_t rbd_snap_features_show(struct device *dev,
3367 struct device_attribute *attr,
3368 char *buf)
3369{
3370 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
3371
3372 return sprintf(buf, "0x%016llx\n",
3373 (unsigned long long) snap->features);
3374}
3375
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003376static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
3377static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
Alex Elder34b13182012-07-13 20:35:12 -05003378static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003379
3380static struct attribute *rbd_snap_attrs[] = {
3381 &dev_attr_snap_size.attr,
3382 &dev_attr_snap_id.attr,
Alex Elder34b13182012-07-13 20:35:12 -05003383 &dev_attr_snap_features.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003384 NULL,
3385};
3386
3387static struct attribute_group rbd_snap_attr_group = {
3388 .attrs = rbd_snap_attrs,
3389};
3390
3391static void rbd_snap_dev_release(struct device *dev)
3392{
3393 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
3394 kfree(snap->name);
3395 kfree(snap);
3396}
3397
3398static const struct attribute_group *rbd_snap_attr_groups[] = {
3399 &rbd_snap_attr_group,
3400 NULL
3401};
3402
3403static struct device_type rbd_snap_device_type = {
3404 .groups = rbd_snap_attr_groups,
3405 .release = rbd_snap_dev_release,
3406};
3407
Alex Elder8b8fb992012-10-26 17:25:24 -05003408static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
3409{
3410 kref_get(&spec->kref);
3411
3412 return spec;
3413}
3414
3415static void rbd_spec_free(struct kref *kref);
3416static void rbd_spec_put(struct rbd_spec *spec)
3417{
3418 if (spec)
3419 kref_put(&spec->kref, rbd_spec_free);
3420}
3421
3422static struct rbd_spec *rbd_spec_alloc(void)
3423{
3424 struct rbd_spec *spec;
3425
3426 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
3427 if (!spec)
3428 return NULL;
3429 kref_init(&spec->kref);
3430
Alex Elder8b8fb992012-10-26 17:25:24 -05003431 return spec;
3432}
3433
3434static void rbd_spec_free(struct kref *kref)
3435{
3436 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
3437
3438 kfree(spec->pool_name);
3439 kfree(spec->image_id);
3440 kfree(spec->image_name);
3441 kfree(spec->snap_name);
3442 kfree(spec);
3443}
3444
Alex Eldercc344fa2013-02-19 12:25:56 -06003445static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
Alex Elderc53d5892012-10-25 23:34:42 -05003446 struct rbd_spec *spec)
3447{
3448 struct rbd_device *rbd_dev;
3449
3450 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
3451 if (!rbd_dev)
3452 return NULL;
3453
3454 spin_lock_init(&rbd_dev->lock);
Alex Elder6d292902013-01-14 12:43:31 -06003455 rbd_dev->flags = 0;
Alex Elderc53d5892012-10-25 23:34:42 -05003456 INIT_LIST_HEAD(&rbd_dev->node);
3457 INIT_LIST_HEAD(&rbd_dev->snaps);
3458 init_rwsem(&rbd_dev->header_rwsem);
3459
3460 rbd_dev->spec = spec;
3461 rbd_dev->rbd_client = rbdc;
3462
Alex Elder0903e872012-11-14 12:25:19 -06003463 /* Initialize the layout used for all rbd requests */
3464
3465 rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
3466 rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
3467 rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
3468 rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
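	/*
	 * A stripe unit equal to the object size with a stripe count
	 * of one means image data is not striped across objects; each
	 * object holds a single contiguous chunk of the image.
	 */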
3469
Alex Elderc53d5892012-10-25 23:34:42 -05003470 return rbd_dev;
3471}
3472
3473static void rbd_dev_destroy(struct rbd_device *rbd_dev)
3474{
Alex Elder86b00e02012-10-25 23:34:42 -05003475 rbd_spec_put(rbd_dev->parent_spec);
Alex Elderc53d5892012-10-25 23:34:42 -05003476 kfree(rbd_dev->header_name);
3477 rbd_put_client(rbd_dev->rbd_client);
3478 rbd_spec_put(rbd_dev->spec);
3479 kfree(rbd_dev);
3480}
3481
Alex Elder304f6802012-08-31 17:29:52 -05003482static bool rbd_snap_registered(struct rbd_snap *snap)
3483{
3484 bool ret = snap->dev.type == &rbd_snap_device_type;
3485 bool reg = device_is_registered(&snap->dev);
3486
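	/* The device type is set if and only if the device is registered */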
3487 rbd_assert(!ret ^ reg);
3488
3489 return ret;
3490}
3491
Alex Elder41f38c22012-10-25 23:34:40 -05003492static void rbd_remove_snap_dev(struct rbd_snap *snap)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003493{
3494 list_del(&snap->node);
Alex Elder304f6802012-08-31 17:29:52 -05003495 if (device_is_registered(&snap->dev))
3496 device_unregister(&snap->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003497}
3498
Alex Elder14e70852012-07-19 09:09:27 -05003499static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003500 struct device *parent)
3501{
3502 struct device *dev = &snap->dev;
3503 int ret;
3504
3505 dev->type = &rbd_snap_device_type;
3506 dev->parent = parent;
3507 dev->release = rbd_snap_dev_release;
Alex Elderd4b125e2012-07-03 16:01:19 -05003508 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
Alex Elder304f6802012-08-31 17:29:52 -05003509 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
3510
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003511 ret = device_register(dev);
3512
3513 return ret;
3514}
3515
Alex Elder4e891e02012-07-10 20:30:10 -05003516static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05003517 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05003518 u64 snap_id, u64 snap_size,
3519 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003520{
Alex Elder4e891e02012-07-10 20:30:10 -05003521 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003522 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05003523
3524 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003525 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05003526 return ERR_PTR(-ENOMEM);
3527
3528 ret = -ENOMEM;
Alex Elderc8d18422012-07-10 20:30:11 -05003529 snap->name = kstrdup(snap_name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05003530 if (!snap->name)
3531 goto err;
3532
Alex Elderc8d18422012-07-10 20:30:11 -05003533 snap->id = snap_id;
3534 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05003535 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05003536
3537 return snap;
3538
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003539err:
3540 kfree(snap->name);
3541 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05003542
3543 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003544}
3545
Alex Eldercd892122012-07-03 16:01:19 -05003546static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
3547 u64 *snap_size, u64 *snap_features)
3548{
3549 char *snap_name;
3550
3551 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
3552
3553 *snap_size = rbd_dev->header.snap_sizes[which];
3554 *snap_features = 0; /* No features for v1 */
3555
3556 /* Skip over names until we find the one we are looking for */
3557
3558 snap_name = rbd_dev->header.snap_names;
3559 while (which--)
3560 snap_name += strlen(snap_name) + 1;
3561
3562 return snap_name;
3563}
3564
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003565/*
Alex Elder9d475de2012-07-03 16:01:19 -05003566 * Get the size and object order for an image snapshot, or if
3567 * snap_id is CEPH_NOSNAP, gets this information for the base
3568 * image.
3569 */
3570static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
3571 u8 *order, u64 *snap_size)
3572{
3573 __le64 snapid = cpu_to_le64(snap_id);
3574 int ret;
3575 struct {
3576 u8 order;
3577 __le64 size;
3578 } __attribute__ ((packed)) size_buf = { 0 };
3579
Alex Elder36be9a72013-01-19 00:30:28 -06003580 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder9d475de2012-07-03 16:01:19 -05003581 "rbd", "get_size",
3582 (char *) &snapid, sizeof (snapid),
Alex Elder07b23912012-11-09 08:43:16 -06003583 (char *) &size_buf, sizeof (size_buf), NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06003584 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder9d475de2012-07-03 16:01:19 -05003585 if (ret < 0)
3586 return ret;
3587
3588 *order = size_buf.order;
3589 *snap_size = le64_to_cpu(size_buf.size);
3590
3591 dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
3592 (unsigned long long) snap_id, (unsigned int) *order,
3593 (unsigned long long) *snap_size);
3594
3595 return 0;
3596}
3597
3598static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
3599{
3600 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
3601 &rbd_dev->header.obj_order,
3602 &rbd_dev->header.image_size);
3603}
3604
Alex Elder1e130192012-07-03 16:01:19 -05003605static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
3606{
3607 void *reply_buf;
3608 int ret;
3609 void *p;
3610
3611 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
3612 if (!reply_buf)
3613 return -ENOMEM;
3614
Alex Elder36be9a72013-01-19 00:30:28 -06003615 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder1e130192012-07-03 16:01:19 -05003616 "rbd", "get_object_prefix",
3617 NULL, 0,
Alex Elder07b23912012-11-09 08:43:16 -06003618 reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06003619 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder1e130192012-07-03 16:01:19 -05003620 if (ret < 0)
3621 goto out;
3622
3623 p = reply_buf;
3624 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
3625 p + RBD_OBJ_PREFIX_LEN_MAX,
3626 NULL, GFP_NOIO);
3627
3628 if (IS_ERR(rbd_dev->header.object_prefix)) {
3629 ret = PTR_ERR(rbd_dev->header.object_prefix);
3630 rbd_dev->header.object_prefix = NULL;
3631 } else {
3632 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
3633 }
3634
3635out:
3636 kfree(reply_buf);
3637
3638 return ret;
3639}
3640
Alex Elderb1b54022012-07-03 16:01:19 -05003641static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
3642 u64 *snap_features)
3643{
3644 __le64 snapid = cpu_to_le64(snap_id);
3645 struct {
3646 __le64 features;
3647 __le64 incompat;
3648 } features_buf = { 0 };
Alex Elderd8891402012-10-09 13:50:17 -07003649 u64 incompat;
Alex Elderb1b54022012-07-03 16:01:19 -05003650 int ret;
3651
Alex Elder36be9a72013-01-19 00:30:28 -06003652 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elderb1b54022012-07-03 16:01:19 -05003653 "rbd", "get_features",
3654 (char *) &snapid, sizeof (snapid),
3655 (char *) &features_buf, sizeof (features_buf),
Alex Elder07b23912012-11-09 08:43:16 -06003656 NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06003657 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderb1b54022012-07-03 16:01:19 -05003658 if (ret < 0)
3659 return ret;
Alex Elderd8891402012-10-09 13:50:17 -07003660
3661 incompat = le64_to_cpu(features_buf.incompat);
Alex Elder5cbf6f122013-04-11 09:29:48 -05003662 if (incompat & ~RBD_FEATURES_SUPPORTED)
Alex Elderb8f5c6e2012-11-01 08:39:26 -05003663 return -ENXIO;
Alex Elderd8891402012-10-09 13:50:17 -07003664
Alex Elderb1b54022012-07-03 16:01:19 -05003665 *snap_features = le64_to_cpu(features_buf.features);
3666
3667 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
3668 (unsigned long long) snap_id,
3669 (unsigned long long) *snap_features,
3670 (unsigned long long) le64_to_cpu(features_buf.incompat));
3671
3672 return 0;
3673}
3674
3675static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
3676{
3677 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
3678 &rbd_dev->header.features);
3679}
3680
Alex Elder86b00e02012-10-25 23:34:42 -05003681static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
3682{
3683 struct rbd_spec *parent_spec;
3684 size_t size;
3685 void *reply_buf = NULL;
3686 __le64 snapid;
3687 void *p;
3688 void *end;
3689 char *image_id;
3690 u64 overlap;
Alex Elder86b00e02012-10-25 23:34:42 -05003691 int ret;
3692
3693 parent_spec = rbd_spec_alloc();
3694 if (!parent_spec)
3695 return -ENOMEM;
3696
3697 size = sizeof (__le64) + /* pool_id */
3698 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
3699 sizeof (__le64) + /* snap_id */
3700 sizeof (__le64); /* overlap */
3701 reply_buf = kmalloc(size, GFP_KERNEL);
3702 if (!reply_buf) {
3703 ret = -ENOMEM;
3704 goto out_err;
3705 }
3706
3707 snapid = cpu_to_le64(CEPH_NOSNAP);
Alex Elder36be9a72013-01-19 00:30:28 -06003708 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder86b00e02012-10-25 23:34:42 -05003709 "rbd", "get_parent",
3710 (char *) &snapid, sizeof (snapid),
Alex Elder07b23912012-11-09 08:43:16 -06003711 (char *) reply_buf, size, NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06003712 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder86b00e02012-10-25 23:34:42 -05003713 if (ret < 0)
3714 goto out_err;
3715
3716 ret = -ERANGE;
3717 p = reply_buf;
3718 end = (char *) reply_buf + size;
3719 ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
3720 if (parent_spec->pool_id == CEPH_NOPOOL)
3721 goto out; /* No parent? No problem. */
3722
Alex Elder0903e872012-11-14 12:25:19 -06003723 /* The ceph file layout needs to fit pool id in 32 bits */
3724
3725 ret = -EIO;
3726 if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
3727 goto out;
3728
Alex Elder979ed482012-11-01 08:39:26 -05003729 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elder86b00e02012-10-25 23:34:42 -05003730 if (IS_ERR(image_id)) {
3731 ret = PTR_ERR(image_id);
3732 goto out_err;
3733 }
3734 parent_spec->image_id = image_id;
3735 ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
3736 ceph_decode_64_safe(&p, end, overlap, out_err);
3737
3738 rbd_dev->parent_overlap = overlap;
3739 rbd_dev->parent_spec = parent_spec;
3740 parent_spec = NULL; /* rbd_dev now owns this */
3741out:
3742 ret = 0;
3743out_err:
3744 kfree(reply_buf);
3745 rbd_spec_put(parent_spec);
3746
3747 return ret;
3748}
3749
Alex Elder9e15b772012-10-30 19:40:33 -05003750static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
3751{
3752 size_t image_id_size;
3753 char *image_id;
3754 void *p;
3755 void *end;
3756 size_t size;
3757 void *reply_buf = NULL;
3758 size_t len = 0;
3759 char *image_name = NULL;
3760 int ret;
3761
3762 rbd_assert(!rbd_dev->spec->image_name);
3763
Alex Elder69e7a022012-11-01 08:39:26 -05003764 len = strlen(rbd_dev->spec->image_id);
3765 image_id_size = sizeof (__le32) + len;
Alex Elder9e15b772012-10-30 19:40:33 -05003766 image_id = kmalloc(image_id_size, GFP_KERNEL);
3767 if (!image_id)
3768 return NULL;
3769
3770 p = image_id;
3771 end = (char *) image_id + image_id_size;
Alex Elder69e7a022012-11-01 08:39:26 -05003772 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
Alex Elder9e15b772012-10-30 19:40:33 -05003773
3774 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
3775 reply_buf = kmalloc(size, GFP_KERNEL);
3776 if (!reply_buf)
3777 goto out;
3778
Alex Elder36be9a72013-01-19 00:30:28 -06003779 ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
Alex Elder9e15b772012-10-30 19:40:33 -05003780 "rbd", "dir_get_name",
3781 image_id, image_id_size,
Alex Elder07b23912012-11-09 08:43:16 -06003782 (char *) reply_buf, size, NULL);
Alex Elder9e15b772012-10-30 19:40:33 -05003783 if (ret < 0)
3784 goto out;
3785 p = reply_buf;
3786 end = (char *) reply_buf + size;
3787 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
3788 if (IS_ERR(image_name))
3789 image_name = NULL;
3790 else
3791 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
3792out:
3793 kfree(reply_buf);
3794 kfree(image_id);
3795
3796 return image_name;
3797}
3798
3799/*
3800 * When a parent image gets probed, we only have the pool, image,
3801 * and snapshot ids but not the names of any of them. This call
3802 * is made later to fill in those names. It has to be done after
3803 * rbd_dev_snaps_update() has completed because some of the
3804 * information (in particular, snapshot name) is not available
3805 * until then.
3806 */
3807static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
3808{
3809 struct ceph_osd_client *osdc;
3810 const char *name;
3811 void *reply_buf = NULL;
 3812 int ret = -ENOMEM; /* covers the snap_name kstrdup() failure below */
3813
3814 if (rbd_dev->spec->pool_name)
3815 return 0; /* Already have the names */
3816
3817 /* Look up the pool name */
3818
3819 osdc = &rbd_dev->rbd_client->client->osdc;
3820 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05003821 if (!name) {
3822 rbd_warn(rbd_dev, "there is no pool with id %llu",
3823 rbd_dev->spec->pool_id); /* Really a BUG() */
3824 return -EIO;
3825 }
Alex Elder9e15b772012-10-30 19:40:33 -05003826
3827 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
3828 if (!rbd_dev->spec->pool_name)
3829 return -ENOMEM;
3830
3831 /* Fetch the image name; tolerate failure here */
3832
3833 name = rbd_dev_image_name(rbd_dev);
Alex Elder69e7a022012-11-01 08:39:26 -05003834 if (name)
Alex Elder9e15b772012-10-30 19:40:33 -05003835 rbd_dev->spec->image_name = (char *) name;
Alex Elder69e7a022012-11-01 08:39:26 -05003836 else
Alex Elder06ecc6c2012-11-01 10:17:15 -05003837 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05003838
3839 /* Look up the snapshot name. */
3840
3841 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
3842 if (!name) {
Alex Elder935dc892012-11-01 10:17:15 -05003843 rbd_warn(rbd_dev, "no snapshot with id %llu",
3844 rbd_dev->spec->snap_id); /* Really a BUG() */
Alex Elder9e15b772012-10-30 19:40:33 -05003845 ret = -EIO;
3846 goto out_err;
3847 }
3848 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
 3849 if (!rbd_dev->spec->snap_name)
3850 goto out_err;
3851
3852 return 0;
3853out_err:
3854 kfree(reply_buf);
3855 kfree(rbd_dev->spec->pool_name);
3856 rbd_dev->spec->pool_name = NULL;
3857
3858 return ret;
3859}
3860
Alex Elder6e14b1a2012-07-03 16:01:19 -05003861static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05003862{
3863 size_t size;
3864 int ret;
3865 void *reply_buf;
3866 void *p;
3867 void *end;
3868 u64 seq;
3869 u32 snap_count;
3870 struct ceph_snap_context *snapc;
3871 u32 i;
3872
3873 /*
3874 * We'll need room for the seq value (maximum snapshot id),
3875 * snapshot count, and array of that many snapshot ids.
3876 * For now we have a fixed upper limit on the number we're
3877 * prepared to receive.
3878 */
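	/*
	 * With RBD_MAX_SNAP_COUNT snapshots this is 8 + 4 + 510 * 8 =
	 * 4092 bytes, which keeps the largest possible reply within a
	 * single 4 KB allocation.
	 */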
3879 size = sizeof (__le64) + sizeof (__le32) +
3880 RBD_MAX_SNAP_COUNT * sizeof (__le64);
3881 reply_buf = kzalloc(size, GFP_KERNEL);
3882 if (!reply_buf)
3883 return -ENOMEM;
3884
Alex Elder36be9a72013-01-19 00:30:28 -06003885 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder35d489f2012-07-03 16:01:19 -05003886 "rbd", "get_snapcontext",
3887 NULL, 0,
Alex Elder07b23912012-11-09 08:43:16 -06003888 reply_buf, size, ver);
Alex Elder36be9a72013-01-19 00:30:28 -06003889 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder35d489f2012-07-03 16:01:19 -05003890 if (ret < 0)
3891 goto out;
3892
3893 ret = -ERANGE;
3894 p = reply_buf;
3895 end = (char *) reply_buf + size;
3896 ceph_decode_64_safe(&p, end, seq, out);
3897 ceph_decode_32_safe(&p, end, snap_count, out);
3898
3899 /*
3900 * Make sure the reported number of snapshot ids wouldn't go
3901 * beyond the end of our buffer. But before checking that,
3902 * make sure the computed size of the snapshot context we
3903 * allocate is representable in a size_t.
3904 */
3905 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
3906 / sizeof (u64)) {
3907 ret = -EINVAL;
3908 goto out;
3909 }
3910 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3911 goto out;
3912
3913 size = sizeof (struct ceph_snap_context) +
3914 snap_count * sizeof (snapc->snaps[0]);
3915 snapc = kmalloc(size, GFP_KERNEL);
3916 if (!snapc) {
3917 ret = -ENOMEM;
3918 goto out;
3919 }
3920
3921 atomic_set(&snapc->nref, 1);
3922 snapc->seq = seq;
3923 snapc->num_snaps = snap_count;
3924 for (i = 0; i < snap_count; i++)
3925 snapc->snaps[i] = ceph_decode_64(&p);
3926
3927 rbd_dev->header.snapc = snapc;
3928
3929 dout(" snap context seq = %llu, snap_count = %u\n",
3930 (unsigned long long) seq, (unsigned int) snap_count);
 3931 ret = 0;
 3932out:
 3933 kfree(reply_buf);
 3934
 3935 return ret;
3936}
3937
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003938static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
3939{
3940 size_t size;
3941 void *reply_buf;
3942 __le64 snap_id;
3943 int ret;
3944 void *p;
3945 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003946 char *snap_name;
3947
3948 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
3949 reply_buf = kmalloc(size, GFP_KERNEL);
3950 if (!reply_buf)
3951 return ERR_PTR(-ENOMEM);
3952
3953 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
Alex Elder36be9a72013-01-19 00:30:28 -06003954 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003955 "rbd", "get_snapshot_name",
3956 (char *) &snap_id, sizeof (snap_id),
Alex Elder07b23912012-11-09 08:43:16 -06003957 reply_buf, size, NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06003958 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003959 if (ret < 0)
3960 goto out;
3961
3962 p = reply_buf;
3963 end = (char *) reply_buf + size;
Alex Eldere5c35532012-10-25 23:34:41 -05003964 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003965 if (IS_ERR(snap_name)) {
3966 ret = PTR_ERR(snap_name);
3967 goto out;
3968 } else {
3969 dout(" snap_id 0x%016llx snap_name = %s\n",
3970 (unsigned long long) le64_to_cpu(snap_id), snap_name);
3971 }
3972 kfree(reply_buf);
3973
3974 return snap_name;
3975out:
3976 kfree(reply_buf);
3977
3978 return ERR_PTR(ret);
3979}
3980
3981static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3982 u64 *snap_size, u64 *snap_features)
3983{
Alex Eldere0b49862013-01-09 14:44:18 -06003984 u64 snap_id;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003985 u8 order;
3986 int ret;
3987
3988 snap_id = rbd_dev->header.snapc->snaps[which];
3989 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
3990 if (ret)
3991 return ERR_PTR(ret);
3992 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
3993 if (ret)
3994 return ERR_PTR(ret);
3995
3996 return rbd_dev_v2_snap_name(rbd_dev, which);
3997}
3998
3999static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
4000 u64 *snap_size, u64 *snap_features)
4001{
4002 if (rbd_dev->image_format == 1)
4003 return rbd_dev_v1_snap_info(rbd_dev, which,
4004 snap_size, snap_features);
4005 if (rbd_dev->image_format == 2)
4006 return rbd_dev_v2_snap_info(rbd_dev, which,
4007 snap_size, snap_features);
4008 return ERR_PTR(-EINVAL);
4009}
4010
Alex Elder117973f2012-08-31 17:29:55 -05004011static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
4012{
4013 int ret;
4014 __u8 obj_order;
4015
4016 down_write(&rbd_dev->header_rwsem);
4017
4018 /* Grab old order first, to see if it changes */
4019
 4020 obj_order = rbd_dev->header.obj_order;
4021 ret = rbd_dev_v2_image_size(rbd_dev);
4022 if (ret)
4023 goto out;
4024 if (rbd_dev->header.obj_order != obj_order) {
4025 ret = -EIO;
4026 goto out;
4027 }
4028 rbd_update_mapping_size(rbd_dev);
4029
4030 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
4031 dout("rbd_dev_v2_snap_context returned %d\n", ret);
4032 if (ret)
4033 goto out;
4034 ret = rbd_dev_snaps_update(rbd_dev);
4035 dout("rbd_dev_snaps_update returned %d\n", ret);
4036 if (ret)
4037 goto out;
4038 ret = rbd_dev_snaps_register(rbd_dev);
4039 dout("rbd_dev_snaps_register returned %d\n", ret);
4040out:
4041 up_write(&rbd_dev->header_rwsem);
4042
4043 return ret;
4044}
4045
Alex Elder9d475de2012-07-03 16:01:19 -05004046/*
Alex Elder35938152012-08-02 11:29:46 -05004047 * Scan the rbd device's current snapshot list and compare it to the
4048 * newly-received snapshot context. Remove any existing snapshots
4049 * not present in the new snapshot context. Add a new snapshot for
4050 * any snaphots in the snapshot context not in the current list.
4051 * And verify there are no changes to snapshots we already know
4052 * about.
4053 *
4054 * Assumes the snapshots in the snapshot context are sorted by
4055 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
4056 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004057 */
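/*
 * Illustration: given a new context listing ids [12, 7, 3] (highest
 * first) and an existing list [12, 5, 3], the walk below keeps 12,
 * inserts a new snapshot 7 before 5, removes 5 (absent from the new
 * context), and keeps 3.
 */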
Alex Elder304f6802012-08-31 17:29:52 -05004058static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004059{
Alex Elder35938152012-08-02 11:29:46 -05004060 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4061 const u32 snap_count = snapc->num_snaps;
Alex Elder35938152012-08-02 11:29:46 -05004062 struct list_head *head = &rbd_dev->snaps;
4063 struct list_head *links = head->next;
4064 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004065
Alex Elder9fcbb802012-08-23 23:48:49 -05004066 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
Alex Elder35938152012-08-02 11:29:46 -05004067 while (index < snap_count || links != head) {
4068 u64 snap_id;
4069 struct rbd_snap *snap;
Alex Eldercd892122012-07-03 16:01:19 -05004070 char *snap_name;
4071 u64 snap_size = 0;
4072 u64 snap_features = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004073
Alex Elder35938152012-08-02 11:29:46 -05004074 snap_id = index < snap_count ? snapc->snaps[index]
4075 : CEPH_NOSNAP;
4076 snap = links != head ? list_entry(links, struct rbd_snap, node)
4077 : NULL;
Alex Elderaafb2302012-09-06 16:00:54 -05004078 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004079
Alex Elder35938152012-08-02 11:29:46 -05004080 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
4081 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004082
Alex Elder6d292902013-01-14 12:43:31 -06004083 /*
4084 * A previously-existing snapshot is not in
4085 * the new snap context.
4086 *
4087 * If the now missing snapshot is the one the
4088 * image is mapped to, clear its exists flag
4089 * so we can avoid sending any more requests
4090 * to it.
4091 */
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004092 if (rbd_dev->spec->snap_id == snap->id)
Alex Elder6d292902013-01-14 12:43:31 -06004093 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
Alex Elder41f38c22012-10-25 23:34:40 -05004094 rbd_remove_snap_dev(snap);
Alex Elder9fcbb802012-08-23 23:48:49 -05004095 dout("%ssnap id %llu has been removed\n",
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004096 rbd_dev->spec->snap_id == snap->id ?
4097 "mapped " : "",
Alex Elder9fcbb802012-08-23 23:48:49 -05004098 (unsigned long long) snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004099
Alex Elder35938152012-08-02 11:29:46 -05004100 /* Done with this list entry; advance */
4101
4102 links = next;
4103 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004104 }
Alex Elder35938152012-08-02 11:29:46 -05004105
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004106 snap_name = rbd_dev_snap_info(rbd_dev, index,
4107 &snap_size, &snap_features);
Alex Eldercd892122012-07-03 16:01:19 -05004108 if (IS_ERR(snap_name))
4109 return PTR_ERR(snap_name);
4110
Alex Elder9fcbb802012-08-23 23:48:49 -05004111 dout("entry %u: snap_id = %llu\n", (unsigned int) index,
 4112 (unsigned long long) snap_id);
Alex Elder35938152012-08-02 11:29:46 -05004113 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
4114 struct rbd_snap *new_snap;
4115
4116 /* We haven't seen this snapshot before */
4117
Alex Elderc8d18422012-07-10 20:30:11 -05004118 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
Alex Eldercd892122012-07-03 16:01:19 -05004119 snap_id, snap_size, snap_features);
Alex Elder9fcbb802012-08-23 23:48:49 -05004120 if (IS_ERR(new_snap)) {
4121 int err = PTR_ERR(new_snap);
4122
4123 dout(" failed to add dev, error %d\n", err);
4124
4125 return err;
4126 }
Alex Elder35938152012-08-02 11:29:46 -05004127
4128 /* New goes before existing, or at end of list */
4129
Alex Elder9fcbb802012-08-23 23:48:49 -05004130 dout(" added dev%s\n", snap ? "" : " at end");
Alex Elder35938152012-08-02 11:29:46 -05004131 if (snap)
4132 list_add_tail(&new_snap->node, &snap->node);
4133 else
Alex Elder523f3252012-08-30 00:16:37 -05004134 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05004135 } else {
4136 /* Already have this one */
4137
Alex Elder9fcbb802012-08-23 23:48:49 -05004138 dout(" already present\n");
4139
Alex Eldercd892122012-07-03 16:01:19 -05004140 rbd_assert(snap->size == snap_size);
Alex Elderaafb2302012-09-06 16:00:54 -05004141 rbd_assert(!strcmp(snap->name, snap_name));
Alex Eldercd892122012-07-03 16:01:19 -05004142 rbd_assert(snap->features == snap_features);
Alex Elder35938152012-08-02 11:29:46 -05004143
4144 /* Done with this list entry; advance */
4145
4146 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004147 }
Alex Elder35938152012-08-02 11:29:46 -05004148
4149 /* Advance to the next entry in the snapshot context */
4150
4151 index++;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004152 }
Alex Elder9fcbb802012-08-23 23:48:49 -05004153 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004154
4155 return 0;
4156}
4157
Alex Elder304f6802012-08-31 17:29:52 -05004158/*
4159 * Scan the list of snapshots and register the devices for any that
4160 * have not already been registered.
4161 */
4162static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
4163{
4164 struct rbd_snap *snap;
4165 int ret = 0;
4166
Alex Elder37206ee2013-02-20 17:32:08 -06004167 dout("%s:\n", __func__);
Alex Elder86ff77b2012-08-31 17:29:53 -05004168 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
4169 return -EIO;
Alex Elder304f6802012-08-31 17:29:52 -05004170
4171 list_for_each_entry(snap, &rbd_dev->snaps, node) {
4172 if (!rbd_snap_registered(snap)) {
4173 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
4174 if (ret < 0)
4175 break;
4176 }
4177 }
4178 dout("%s: returning %d\n", __func__, ret);
4179
4180 return ret;
4181}
4182
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004183static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
4184{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004185 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05004186 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004187
4188 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004189
Alex Eldercd789ab2012-08-30 00:16:38 -05004190 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004191 dev->bus = &rbd_bus_type;
4192 dev->type = &rbd_device_type;
4193 dev->parent = &rbd_root_dev;
4194 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05004195 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004196 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004197
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004198 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05004199
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004200 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004201}
4202
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004203static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
4204{
4205 device_unregister(&rbd_dev->dev);
4206}
4207
Alex Eldere2839302012-08-29 17:11:06 -05004208static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06004209
4210/*
Alex Elder499afd52012-02-02 08:13:29 -06004211 * Get a unique rbd identifier for the given new rbd_dev, and add
4212 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06004213 */
Alex Eldere2839302012-08-29 17:11:06 -05004214static void rbd_dev_id_get(struct rbd_device *rbd_dev)
Alex Elderb7f23c32012-01-29 13:57:43 -06004215{
Alex Eldere2839302012-08-29 17:11:06 -05004216 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
Alex Elder499afd52012-02-02 08:13:29 -06004217
4218 spin_lock(&rbd_dev_list_lock);
4219 list_add_tail(&rbd_dev->node, &rbd_dev_list);
4220 spin_unlock(&rbd_dev_list_lock);
Alex Eldere2839302012-08-29 17:11:06 -05004221 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
4222 (unsigned long long) rbd_dev->dev_id);
Alex Elder1ddbe942012-01-29 13:57:44 -06004223}
Alex Elderb7f23c32012-01-29 13:57:43 -06004224
Alex Elder1ddbe942012-01-29 13:57:44 -06004225/*
Alex Elder499afd52012-02-02 08:13:29 -06004226 * Remove an rbd_dev from the global list, and record that its
4227 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06004228 */
Alex Eldere2839302012-08-29 17:11:06 -05004229static void rbd_dev_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06004230{
Alex Elderd184f6b2012-01-29 13:57:44 -06004231 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05004232 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06004233 int max_id;
4234
Alex Elderaafb2302012-09-06 16:00:54 -05004235 rbd_assert(rbd_id > 0);
Alex Elder499afd52012-02-02 08:13:29 -06004236
Alex Eldere2839302012-08-29 17:11:06 -05004237 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
4238 (unsigned long long) rbd_dev->dev_id);
Alex Elder499afd52012-02-02 08:13:29 -06004239 spin_lock(&rbd_dev_list_lock);
4240 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06004241
4242 /*
4243 * If the id being "put" is not the current maximum, there
4244 * is nothing special we need to do.
4245 */
Alex Eldere2839302012-08-29 17:11:06 -05004246 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
Alex Elderd184f6b2012-01-29 13:57:44 -06004247 spin_unlock(&rbd_dev_list_lock);
4248 return;
4249 }
4250
4251 /*
4252 * We need to update the current maximum id. Search the
4253 * list to find out what it is. We're more likely to find
4254 * the maximum at the end, so search the list backward.
4255 */
4256 max_id = 0;
4257 list_for_each_prev(tmp, &rbd_dev_list) {
4258 struct rbd_device *rbd_dev;
4259
4260 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderb213e0b2012-10-10 21:19:13 -07004261 if (rbd_dev->dev_id > max_id)
4262 max_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06004263 }
Alex Elder499afd52012-02-02 08:13:29 -06004264 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06004265
Alex Elder1ddbe942012-01-29 13:57:44 -06004266 /*
Alex Eldere2839302012-08-29 17:11:06 -05004267 * The max id could have been updated by rbd_dev_id_get(), in
Alex Elderd184f6b2012-01-29 13:57:44 -06004268 * which case it now accurately reflects the new maximum.
4269 * Be careful not to overwrite the maximum value in that
4270 * case.
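 *
 * For example (hypothetical ids): if ids 1, 2 and 3 are in
 * use and id 3 is put, the rescan finds 2 and the cmpxchg
 * moves the max from 3 to 2; if a concurrent rbd_dev_id_get()
 * already bumped the max from 3 to 4, the cmpxchg fails and
 * 4 is (correctly) left in place.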
Alex Elder1ddbe942012-01-29 13:57:44 -06004271 */
Alex Eldere2839302012-08-29 17:11:06 -05004272 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
4273 dout(" max dev id has been reset\n");
Alex Elderb7f23c32012-01-29 13:57:43 -06004274}
4275
Alex Eldera725f65e2012-02-02 08:13:30 -06004276/*
Alex Eldere28fff262012-02-02 08:13:30 -06004277 * Skips over white space at *buf, and updates *buf to point to the
4278 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06004279 * the token (string of non-white space characters) found. Note
4280 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06004281 */
4282static inline size_t next_token(const char **buf)
4283{
4284 /*
4285 * These are the characters that produce nonzero for
4286 * isspace() in the "C" and "POSIX" locales.
4287 */
4288 const char *spaces = " \f\n\r\t\v";
4289
4290 *buf += strspn(*buf, spaces); /* Find start of token */
4291
4292 return strcspn(*buf, spaces); /* Return token length */
4293}
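
/*
 * Illustrative use of next_token(), with a hypothetical buffer:
 *
 *	const char *p = "  foo bar";
 *	size_t n = next_token(&p);
 *
 * leaves n == 3 and p pointing at "foo bar"; the token itself is
 * not consumed (copy_token() and dup_token() below do that).
 */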
4294
4295/*
4296 * Finds the next token in *buf, and if the provided token buffer is
4297 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06004298 * copied, is guaranteed to be terminated with '\0'. Note that *buf
4299 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06004300 *
4301 * Returns the length of the token found (not including the '\0').
4302 * Return value will be 0 if no token is found, and it will be >=
4303 * token_size if the token would not fit.
4304 *
Alex Elder593a9e72012-02-07 12:03:37 -06004305 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06004306 * found token. Note that this occurs even if the token buffer is
4307 * too small to hold it.
4308 */
4309static inline size_t copy_token(const char **buf,
4310 char *token,
4311 size_t token_size)
4312{
4313 size_t len;
4314
4315 len = next_token(buf);
4316 if (len < token_size) {
4317 memcpy(token, *buf, len);
4318 *(token + len) = '\0';
4319 }
4320 *buf += len;
4321
4322 return len;
4323}
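
/*
 * Illustrative use of copy_token(), with a hypothetical buffer:
 *
 *	const char *p = "foo bar";
 *	char tok[4];
 *	size_t n = copy_token(&p, tok, sizeof (tok));
 *
 * leaves n == 3, tok holding "foo", and p pointing at " bar".
 * Were tok only 3 bytes, n would still be 3 (>= token_size) and
 * tok would be left untouched.
 */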
4324
4325/*
Alex Elderea3352f2012-07-09 21:04:23 -05004326 * Finds the next token in *buf, dynamically allocates a buffer big
4327 * enough to hold a copy of it, and copies the token into the new
4328 * buffer. The copy is guaranteed to be terminated with '\0'. Note
4329 * that a duplicate buffer is created even for a zero-length token.
4330 *
4331 * Returns a pointer to the newly-allocated duplicate, or a null
4332 * pointer if memory for the duplicate was not available. If
4333 * the lenp argument is a non-null pointer, the length of the token
4334 * (not including the '\0') is returned in *lenp.
4335 *
4336 * If successful, the *buf pointer will be updated to point beyond
4337 * the end of the found token.
4338 *
4339 * Note: uses GFP_KERNEL for allocation.
4340 */
4341static inline char *dup_token(const char **buf, size_t *lenp)
4342{
4343 char *dup;
4344 size_t len;
4345
4346 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05004347 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05004348 if (!dup)
4349 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05004350 *(dup + len) = '\0';
4351 *buf += len;
4352
4353 if (lenp)
4354 *lenp = len;
4355
4356 return dup;
4357}
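
/*
 * Illustrative use of dup_token(), with a hypothetical buffer:
 *
 *	const char *p = "mypool myimage";
 *	size_t len;
 *	char *pool = dup_token(&p, &len);
 *
 * yields pool == "mypool" (to be freed with kfree()), len == 6,
 * and p pointing at " myimage"; a null return means the
 * allocation failed.
 */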
4358
4359/*
Alex Elder859c31d2012-10-25 23:34:42 -05004360 * Parse the options provided for an "rbd add" (i.e., rbd image
4361 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
4362 * and the data written is passed here via a NUL-terminated buffer.
4363 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05004364 *
Alex Elder859c31d2012-10-25 23:34:42 -05004365 * The information extracted from these options is recorded in
4366 * the other parameters which return dynamically-allocated
4367 * structures:
4368 * ceph_opts
4369 * The address of a pointer that will refer to a ceph options
4370 * structure. Caller must release the returned pointer using
4371 * ceph_destroy_options() when it is no longer needed.
4372 * rbd_opts
4373 * Address of an rbd options pointer. Fully initialized by
4374 * this function; caller must release with kfree().
4375 * spec
4376 * Address of an rbd image specification pointer. Fully
4377 * initialized by this function based on parsed options.
4378 * Caller must release with rbd_spec_put().
4379 *
4380 * The options passed take this form:
4381 * <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
4382 * where:
4383 * <mon_addrs>
4384 * A comma-separated list of one or more monitor addresses.
4385 * A monitor address is an ip address, optionally followed
4386 * by a port number (separated by a colon).
4387 * I.e.: ip1[:port1][,ip2[:port2]...]
4388 * <options>
4389 * A comma-separated list of ceph and/or rbd options.
4390 * <pool_name>
4391 * The name of the rados pool containing the rbd image.
4392 * <image_name>
4393 * The name of the image in that pool to map.
4394 * <snap_name>
4395 * An optional snapshot name. If provided, the mapping will
4396 * present data from the image at the time that snapshot was
4397 * created. The image head is used if no snapshot name is
4398 * provided. Snapshot mappings are always read-only.
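 *
 * For example (monitor address, pool and image names all
 * hypothetical), writing
 *	1.2.3.4:6789 name=admin rbd myimage
 * to /sys/bus/rbd/add maps the head of image "myimage" in
 * pool "rbd", using the monitor at 1.2.3.4:6789.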
Alex Eldera725f65e2012-02-02 08:13:30 -06004399 */
Alex Elder859c31d2012-10-25 23:34:42 -05004400static int rbd_add_parse_args(const char *buf,
Alex Elderdc79b112012-10-25 23:34:41 -05004401 struct ceph_options **ceph_opts,
Alex Elder859c31d2012-10-25 23:34:42 -05004402 struct rbd_options **opts,
4403 struct rbd_spec **rbd_spec)
Alex Eldera725f65e2012-02-02 08:13:30 -06004404{
Alex Elderd22f76e2012-07-12 10:46:35 -05004405 size_t len;
Alex Elder859c31d2012-10-25 23:34:42 -05004406 char *options;
Alex Elder0ddebc02012-10-25 23:34:41 -05004407 const char *mon_addrs;
4408 size_t mon_addrs_size;
Alex Elder859c31d2012-10-25 23:34:42 -05004409 struct rbd_spec *spec = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05004410 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05004411 struct ceph_options *copts;
Alex Elderdc79b112012-10-25 23:34:41 -05004412 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06004413
4414 /* The first four tokens are required */
4415
Alex Elder7ef32142012-02-02 08:13:30 -06004416 len = next_token(&buf);
Alex Elder4fb5d6712012-11-01 10:17:15 -05004417 if (!len) {
4418 rbd_warn(NULL, "no monitor address(es) provided");
4419 return -EINVAL;
4420 }
Alex Elder0ddebc02012-10-25 23:34:41 -05004421 mon_addrs = buf;
Alex Elderf28e5652012-10-25 23:34:41 -05004422 mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06004423 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06004424
Alex Elderdc79b112012-10-25 23:34:41 -05004425 ret = -EINVAL;
Alex Elderf28e5652012-10-25 23:34:41 -05004426 options = dup_token(&buf, NULL);
4427 if (!options)
Alex Elderdc79b112012-10-25 23:34:41 -05004428 return -ENOMEM;
Alex Elder4fb5d6712012-11-01 10:17:15 -05004429 if (!*options) {
4430 rbd_warn(NULL, "no options provided");
4431 goto out_err;
4432 }
Alex Eldera725f65e2012-02-02 08:13:30 -06004433
Alex Elder859c31d2012-10-25 23:34:42 -05004434 spec = rbd_spec_alloc();
4435 if (!spec)
Alex Elderf28e5652012-10-25 23:34:41 -05004436 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05004437
4438 spec->pool_name = dup_token(&buf, NULL);
4439 if (!spec->pool_name)
4440 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05004441 if (!*spec->pool_name) {
4442 rbd_warn(NULL, "no pool name provided");
4443 goto out_err;
4444 }
Alex Eldere28fff262012-02-02 08:13:30 -06004445
Alex Elder69e7a022012-11-01 08:39:26 -05004446 spec->image_name = dup_token(&buf, NULL);
Alex Elder859c31d2012-10-25 23:34:42 -05004447 if (!spec->image_name)
Alex Elderf28e5652012-10-25 23:34:41 -05004448 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05004449 if (!*spec->image_name) {
4450 rbd_warn(NULL, "no image name provided");
4451 goto out_err;
4452 }
Alex Eldere28fff262012-02-02 08:13:30 -06004453
Alex Elderf28e5652012-10-25 23:34:41 -05004454 /*
4455 * Snapshot name is optional; default is to use "-"
4456 * (indicating the head/no snapshot).
4457 */
Alex Elder3feeb8942012-08-31 17:29:52 -05004458 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05004459 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05004460 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
4461 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elderf28e5652012-10-25 23:34:41 -05004462 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
Alex Elderdc79b112012-10-25 23:34:41 -05004463 ret = -ENAMETOOLONG;
Alex Elderf28e5652012-10-25 23:34:41 -05004464 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05004465 }
Alex Elder4caf35f2012-11-01 08:39:27 -05004466 spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
Alex Elder859c31d2012-10-25 23:34:42 -05004467 if (!spec->snap_name)
Alex Elderf28e5652012-10-25 23:34:41 -05004468 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05004469 *(spec->snap_name + len) = '\0';
Alex Eldere5c35532012-10-25 23:34:41 -05004470
Alex Elder0ddebc02012-10-25 23:34:41 -05004471 /* Initialize all rbd options to the defaults */
Alex Eldere28fff262012-02-02 08:13:30 -06004472
Alex Elder4e9afeb2012-10-25 23:34:41 -05004473 rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
4474 if (!rbd_opts)
4475 goto out_mem;
4476
4477 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
Alex Elderd22f76e2012-07-12 10:46:35 -05004478
Alex Elder859c31d2012-10-25 23:34:42 -05004479 copts = ceph_parse_options(options, mon_addrs,
Alex Elder0ddebc02012-10-25 23:34:41 -05004480 mon_addrs + mon_addrs_size - 1,
Alex Elder4e9afeb2012-10-25 23:34:41 -05004481 parse_rbd_opts_token, rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05004482 if (IS_ERR(copts)) {
4483 ret = PTR_ERR(copts);
Alex Elderdc79b112012-10-25 23:34:41 -05004484 goto out_err;
4485 }
Alex Elder859c31d2012-10-25 23:34:42 -05004486 kfree(options);
4487
4488 *ceph_opts = copts;
Alex Elder4e9afeb2012-10-25 23:34:41 -05004489 *opts = rbd_opts;
Alex Elder859c31d2012-10-25 23:34:42 -05004490 *rbd_spec = spec;
Alex Elder0ddebc02012-10-25 23:34:41 -05004491
Alex Elderdc79b112012-10-25 23:34:41 -05004492 return 0;
Alex Elderf28e5652012-10-25 23:34:41 -05004493out_mem:
Alex Elderdc79b112012-10-25 23:34:41 -05004494 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05004495out_err:
Alex Elder859c31d2012-10-25 23:34:42 -05004496 kfree(rbd_opts);
4497 rbd_spec_put(spec);
Alex Elderf28e5652012-10-25 23:34:41 -05004498 kfree(options);
Alex Elderd22f76e2012-07-12 10:46:35 -05004499
Alex Elderdc79b112012-10-25 23:34:41 -05004500 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06004501}
4502
Alex Elder589d30e2012-07-10 20:30:11 -05004503/*
4504 * An rbd format 2 image has a unique identifier, distinct from the
4505 * name given to it by the user. Internally, that identifier is
4506 * what's used to specify the names of objects related to the image.
4507 *
4508 * A special "rbd id" object is used to map an rbd image name to its
4509 * id. If that object doesn't exist, then there is no v2 rbd image
4510 * with the supplied name.
4511 *
4512 * This function will record the given rbd_dev's image_id field if
4513 * it can be determined, and in that case will return 0. If any
4514 * errors occur a negative errno will be returned and the rbd_dev's
4515 * image_id field will be unchanged (and should be NULL).
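 *
 * For example (hypothetical image name), the id of a format 2
 * image named "foo" lives in an object named "rbd_id.foo"
 * (assuming RBD_ID_PREFIX is "rbd_id."), and the string stored
 * there becomes the image_id recorded here.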
4516 */
4517static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4518{
4519 int ret;
4520 size_t size;
4521 char *object_name;
4522 void *response;
4523 void *p;
4524
Alex Elder2f82ee52012-10-30 19:40:33 -05004525 /*
4526 * If we already have it we don't need to look it up.
Alex Elder2c0d0a12012-10-30 19:40:33 -05004531 * When probing a parent image, the image id is already
4532 * known (and the image name likely is not), so there is
4533 * no need to fetch it again in that case.
4534 */
4535 if (rbd_dev->spec->image_id)
4536 return 0;
4537
4538 /*
Alex Elder589d30e2012-07-10 20:30:11 -05004539 * First, see if the format 2 image id file exists, and if
4540 * so, get the image's persistent id from it.
4541 */
Alex Elder69e7a022012-11-01 08:39:26 -05004542 size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
Alex Elder589d30e2012-07-10 20:30:11 -05004543 object_name = kmalloc(size, GFP_NOIO);
4544 if (!object_name)
4545 return -ENOMEM;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004546 sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
Alex Elder589d30e2012-07-10 20:30:11 -05004547 dout("rbd id object name is %s\n", object_name);
4548
4549 /* Response will be an encoded string, which includes a length */
4550
4551 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
4552 response = kzalloc(size, GFP_NOIO);
4553 if (!response) {
4554 ret = -ENOMEM;
4555 goto out;
4556 }
4557
Alex Elder36be9a72013-01-19 00:30:28 -06004558 ret = rbd_obj_method_sync(rbd_dev, object_name,
Alex Elder589d30e2012-07-10 20:30:11 -05004559 "rbd", "get_id",
4560 NULL, 0,
Alex Elder07b23912012-11-09 08:43:16 -06004561 response, RBD_IMAGE_ID_LEN_MAX, NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06004562 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder589d30e2012-07-10 20:30:11 -05004563 if (ret < 0)
4564 goto out;
4565
4566 p = response;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004567 rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
Alex Elder589d30e2012-07-10 20:30:11 -05004568 p + RBD_IMAGE_ID_LEN_MAX,
Alex Elder979ed482012-11-01 08:39:26 -05004569 NULL, GFP_NOIO);
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004570 if (IS_ERR(rbd_dev->spec->image_id)) {
4571 ret = PTR_ERR(rbd_dev->spec->image_id);
4572 rbd_dev->spec->image_id = NULL;
Alex Elder589d30e2012-07-10 20:30:11 -05004573 } else {
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004574 dout("image_id is %s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05004575 }
4576out:
4577 kfree(response);
4578 kfree(object_name);
4579
4580 return ret;
4581}
4582
Alex Eldera30b71b2012-07-10 20:30:11 -05004583static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
4584{
4585 int ret;
4586 size_t size;
4587
4588 /* Version 1 images have no id; empty string is used */
4589
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004590 rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
4591 if (!rbd_dev->spec->image_id)
Alex Eldera30b71b2012-07-10 20:30:11 -05004592 return -ENOMEM;
Alex Eldera30b71b2012-07-10 20:30:11 -05004593
4594 /* Record the header object name for this rbd image. */
4595
Alex Elder69e7a022012-11-01 08:39:26 -05004596 size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
Alex Eldera30b71b2012-07-10 20:30:11 -05004597 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4598 if (!rbd_dev->header_name) {
4599 ret = -ENOMEM;
4600 goto out_err;
4601 }
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004602 sprintf(rbd_dev->header_name, "%s%s",
4603 rbd_dev->spec->image_name, RBD_SUFFIX);
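
	/*
	 * E.g., assuming RBD_SUFFIX is ".rbd", a (hypothetical) image
	 * named "foo" gets header object "foo.rbd".
	 */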
Alex Eldera30b71b2012-07-10 20:30:11 -05004604
4605 /* Populate rbd image metadata */
4606
4607 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
4608 if (ret < 0)
4609 goto out_err;
Alex Elder86b00e02012-10-25 23:34:42 -05004610
4611 /* Version 1 images have no parent (no layering) */
4612
4613 rbd_dev->parent_spec = NULL;
4614 rbd_dev->parent_overlap = 0;
4615
Alex Eldera30b71b2012-07-10 20:30:11 -05004616 rbd_dev->image_format = 1;
4617
4618 dout("discovered version 1 image, header name is %s\n",
4619 rbd_dev->header_name);
4620
4621 return 0;
4622
4623out_err:
4624 kfree(rbd_dev->header_name);
4625 rbd_dev->header_name = NULL;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004626 kfree(rbd_dev->spec->image_id);
4627 rbd_dev->spec->image_id = NULL;
Alex Eldera30b71b2012-07-10 20:30:11 -05004628
4629 return ret;
4630}
4631
4632static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
4633{
4634 size_t size;
Alex Elder9d475de2012-07-03 16:01:19 -05004635 int ret;
Alex Elder6e14b1a2012-07-03 16:01:19 -05004636 u64 ver = 0;
Alex Eldera30b71b2012-07-10 20:30:11 -05004637
4638 /*
4639 * Image id was filled in by the caller. Record the header
4640 * object name for this rbd image.
4641 */
Alex Elder979ed482012-11-01 08:39:26 -05004642 size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
Alex Eldera30b71b2012-07-10 20:30:11 -05004643 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4644 if (!rbd_dev->header_name)
4645 return -ENOMEM;
4646 sprintf(rbd_dev->header_name, "%s%s",
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004647 RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
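
	/*
	 * E.g., assuming RBD_HEADER_PREFIX is "rbd_header.", a
	 * (hypothetical) image id "1014b2ae8944a" gets header object
	 * "rbd_header.1014b2ae8944a".
	 */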
Alex Elder9d475de2012-07-03 16:01:19 -05004648
4649 /* Get the size and object order for the image */
4650
4651 ret = rbd_dev_v2_image_size(rbd_dev);
4652 if (ret < 0)
4653 goto out_err;
Alex Elder1e130192012-07-03 16:01:19 -05004654
4655 /* Get the object prefix (a.k.a. block_name) for the image */
4656
4657 ret = rbd_dev_v2_object_prefix(rbd_dev);
4658 if (ret < 0)
4659 goto out_err;
Alex Elderb1b54022012-07-03 16:01:19 -05004660
Alex Elderd8891402012-10-09 13:50:17 -07004661 /* Get and check the features for the image */
Alex Elderb1b54022012-07-03 16:01:19 -05004662
4663 ret = rbd_dev_v2_features(rbd_dev);
4664 if (ret < 0)
4665 goto out_err;
Alex Elder35d489f2012-07-03 16:01:19 -05004666
Alex Elder86b00e02012-10-25 23:34:42 -05004667 /* If the image supports layering, get the parent info */
4668
4669 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
4670 ret = rbd_dev_v2_parent_info(rbd_dev);
4671 if (ret < 0)
4672 goto out_err;
4673 }
4674
Alex Elder6e14b1a2012-07-03 16:01:19 -05004675 /* crypto and compression type aren't (yet) supported for v2 images */
Alex Elder35d489f2012-07-03 16:01:19 -05004676
Alex Elder6e14b1a2012-07-03 16:01:19 -05004677 rbd_dev->header.crypt_type = 0;
4678 rbd_dev->header.comp_type = 0;
4679
4680 /* Get the snapshot context, plus the header version */
4681
4682 ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
Alex Elder35d489f2012-07-03 16:01:19 -05004683 if (ret)
4684 goto out_err;
Alex Elder6e14b1a2012-07-03 16:01:19 -05004685 rbd_dev->header.obj_version = ver;
4686
Alex Eldera30b71b2012-07-10 20:30:11 -05004687 rbd_dev->image_format = 2;
4688
4689 dout("discovered version 2 image, header name is %s\n",
4690 rbd_dev->header_name);
4691
Alex Elder35152972012-08-31 17:29:55 -05004692 return 0;
Alex Elder9d475de2012-07-03 16:01:19 -05004693out_err:
Alex Elder86b00e02012-10-25 23:34:42 -05004694 rbd_dev->parent_overlap = 0;
4695 rbd_spec_put(rbd_dev->parent_spec);
4696 rbd_dev->parent_spec = NULL;
Alex Elder9d475de2012-07-03 16:01:19 -05004697 kfree(rbd_dev->header_name);
4698 rbd_dev->header_name = NULL;
Alex Elder1e130192012-07-03 16:01:19 -05004699 kfree(rbd_dev->header.object_prefix);
4700 rbd_dev->header.object_prefix = NULL;
Alex Elder9d475de2012-07-03 16:01:19 -05004701
4702 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05004703}
4704
Alex Elder83a06262012-10-30 15:47:17 -05004705static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
4706{
Alex Elder2f82ee52012-10-30 19:40:33 -05004707 struct rbd_device *parent = NULL;
4708 struct rbd_spec *parent_spec = NULL;
4709 struct rbd_client *rbdc = NULL;
Alex Elder83a06262012-10-30 15:47:17 -05004710 int ret;
4711
4712 /* no need to lock here, as rbd_dev is not registered yet */
4713 ret = rbd_dev_snaps_update(rbd_dev);
4714 if (ret)
4715 return ret;
4716
Alex Elder9e15b772012-10-30 19:40:33 -05004717 ret = rbd_dev_probe_update_spec(rbd_dev);
4718 if (ret)
4719 goto err_out_snaps;
4720
Alex Elder83a06262012-10-30 15:47:17 -05004721 ret = rbd_dev_set_mapping(rbd_dev);
4722 if (ret)
4723 goto err_out_snaps;
4724
4725 /* generate unique id: find highest unique id, add one */
4726 rbd_dev_id_get(rbd_dev);
4727
4728 /* Fill in the device name, now that we have its id. */
4729 BUILD_BUG_ON(DEV_NAME_LEN
4730 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
4731 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
4732
4733 /* Get our block major device number. */
4734
4735 ret = register_blkdev(0, rbd_dev->name);
4736 if (ret < 0)
4737 goto err_out_id;
4738 rbd_dev->major = ret;
4739
4740 /* Set up the blkdev mapping. */
4741
4742 ret = rbd_init_disk(rbd_dev);
4743 if (ret)
4744 goto err_out_blkdev;
4745
4746 ret = rbd_bus_add_dev(rbd_dev);
4747 if (ret)
4748 goto err_out_disk;
4749
4750 /*
4751 * At this point cleanup in the event of an error is the job
4752 * of the sysfs code (initiated by rbd_bus_del_dev()).
4753 */
Alex Elder2f82ee52012-10-30 19:40:33 -05004754 /* Probe the parent if there is one */
4755
4756 if (rbd_dev->parent_spec) {
4757 /*
4758 * We need to pass a reference to the client and the
4759 * parent spec when creating the parent rbd_dev.
4760 * Images related by parent/child relationships
4761 * always share both.
4762 */
4763 parent_spec = rbd_spec_get(rbd_dev->parent_spec);
4764 rbdc = __rbd_get_client(rbd_dev->rbd_client);
4765
4766 parent = rbd_dev_create(rbdc, parent_spec);
4767 if (!parent) {
4768 ret = -ENOMEM;
4769 goto err_out_spec;
4770 }
4771 rbdc = NULL; /* parent now owns reference */
4772 parent_spec = NULL; /* parent now owns reference */
4773 ret = rbd_dev_probe(parent);
4774 if (ret < 0)
4775 goto err_out_parent;
4776 rbd_dev->parent = parent;
4777 }
4778
Alex Elder83a06262012-10-30 15:47:17 -05004779 down_write(&rbd_dev->header_rwsem);
4780 ret = rbd_dev_snaps_register(rbd_dev);
4781 up_write(&rbd_dev->header_rwsem);
4782 if (ret)
4783 goto err_out_bus;
4784
Alex Elder9969ebc2013-01-18 12:31:10 -06004785 ret = rbd_dev_header_watch_sync(rbd_dev, 1);
Alex Elder83a06262012-10-30 15:47:17 -05004786 if (ret)
4787 goto err_out_bus;
4788
4789 /* Everything's ready. Announce the disk to the world. */
4790
4791 add_disk(rbd_dev->disk);
4792
4793 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
4794 (unsigned long long) rbd_dev->mapping.size);
4795
4796 return ret;
Alex Elder2f82ee52012-10-30 19:40:33 -05004797
4798err_out_parent:
4799 rbd_dev_destroy(parent);
4800err_out_spec:
4801 rbd_spec_put(parent_spec);
4802 rbd_put_client(rbdc);
Alex Elder83a06262012-10-30 15:47:17 -05004803err_out_bus:
4804 /* this will also clean up rest of rbd_dev stuff */
4805
4806 rbd_bus_del_dev(rbd_dev);
4807
4808 return ret;
4809err_out_disk:
4810 rbd_free_disk(rbd_dev);
4811err_out_blkdev:
4812 unregister_blkdev(rbd_dev->major, rbd_dev->name);
4813err_out_id:
4814 rbd_dev_id_put(rbd_dev);
4815err_out_snaps:
4816 rbd_remove_all_snaps(rbd_dev);
4817
4818 return ret;
4819}
4820
Alex Eldera30b71b2012-07-10 20:30:11 -05004821/*
4822 * Probe for the existence of the header object for the given rbd
4823 * device. For format 2 images this includes determining the image
4824 * id.
4825 */
4826static int rbd_dev_probe(struct rbd_device *rbd_dev)
4827{
4828 int ret;
4829
4830 /*
4831 * Get the id from the image id object. If it's not a
4832 * format 2 image, we'll get ENOENT back, and we'll assume
4833 * it's a format 1 image.
4834 */
4835 ret = rbd_dev_image_id(rbd_dev);
4836 if (ret)
4837 ret = rbd_dev_v1_probe(rbd_dev);
4838 else
4839 ret = rbd_dev_v2_probe(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05004840 if (ret) {
Alex Eldera30b71b2012-07-10 20:30:11 -05004841 dout("probe failed, returning %d\n", ret);
4842
Alex Elder83a06262012-10-30 15:47:17 -05004843 return ret;
4844 }
4845
4846 ret = rbd_dev_probe_finish(rbd_dev);
4847 if (ret)
4848 rbd_header_free(&rbd_dev->header);
4849
Alex Eldera30b71b2012-07-10 20:30:11 -05004850 return ret;
4851}
4852
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07004853static ssize_t rbd_add(struct bus_type *bus,
4854 const char *buf,
4855 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004856{
Alex Eldercb8627c2012-07-09 21:04:23 -05004857 struct rbd_device *rbd_dev = NULL;
Alex Elderdc79b112012-10-25 23:34:41 -05004858 struct ceph_options *ceph_opts = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05004859 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05004860 struct rbd_spec *spec = NULL;
Alex Elder9d3997f2012-10-25 23:34:42 -05004861 struct rbd_client *rbdc;
Alex Elder27cc2592012-02-02 08:13:30 -06004862 struct ceph_osd_client *osdc;
4863 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004864
4865 if (!try_module_get(THIS_MODULE))
4866 return -ENODEV;
4867
Alex Eldera725f65e2012-02-02 08:13:30 -06004868 /* parse add command */
Alex Elder859c31d2012-10-25 23:34:42 -05004869 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
Alex Elderdc79b112012-10-25 23:34:41 -05004870 if (rc < 0)
Alex Elderbd4ba652012-10-25 23:34:42 -05004871 goto err_out_module;
Alex Eldera725f65e2012-02-02 08:13:30 -06004872
Alex Elder9d3997f2012-10-25 23:34:42 -05004873 rbdc = rbd_get_client(ceph_opts);
4874 if (IS_ERR(rbdc)) {
4875 rc = PTR_ERR(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05004876 goto err_out_args;
Alex Elder9d3997f2012-10-25 23:34:42 -05004877 }
Alex Elderc53d5892012-10-25 23:34:42 -05004878 ceph_opts = NULL; /* rbd_dev client now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004879
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004880 /* pick the pool */
Alex Elder9d3997f2012-10-25 23:34:42 -05004881 osdc = &rbdc->client->osdc;
Alex Elder859c31d2012-10-25 23:34:42 -05004882 rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004883 if (rc < 0)
4884 goto err_out_client;
Alex Elder859c31d2012-10-25 23:34:42 -05004885 spec->pool_id = (u64) rc;
4886
Alex Elder0903e872012-11-14 12:25:19 -06004887 /* The ceph file layout needs to fit pool id in 32 bits */
4888
4889 if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
4890 rc = -EIO;
4891 goto err_out_client;
4892 }
4893
Alex Elderc53d5892012-10-25 23:34:42 -05004894 rbd_dev = rbd_dev_create(rbdc, spec);
Alex Elderbd4ba652012-10-25 23:34:42 -05004895 if (!rbd_dev)
4896 goto err_out_client;
Alex Elderc53d5892012-10-25 23:34:42 -05004897 rbdc = NULL; /* rbd_dev now owns this */
4898 spec = NULL; /* rbd_dev now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004899
Alex Elderbd4ba652012-10-25 23:34:42 -05004900 rbd_dev->mapping.read_only = rbd_opts->read_only;
Alex Elderc53d5892012-10-25 23:34:42 -05004901 kfree(rbd_opts);
4902 rbd_opts = NULL; /* done with this */
Alex Elderbd4ba652012-10-25 23:34:42 -05004903
Alex Eldera30b71b2012-07-10 20:30:11 -05004904 rc = rbd_dev_probe(rbd_dev);
4905 if (rc < 0)
Alex Elderc53d5892012-10-25 23:34:42 -05004906 goto err_out_rbd_dev;
Alex Elder05fd6f62012-08-29 17:11:07 -05004907
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004908 return count;
Alex Elderc53d5892012-10-25 23:34:42 -05004909err_out_rbd_dev:
4910 rbd_dev_destroy(rbd_dev);
Alex Elderbd4ba652012-10-25 23:34:42 -05004911err_out_client:
Alex Elder9d3997f2012-10-25 23:34:42 -05004912 rbd_put_client(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05004913err_out_args:
Alex Elder78cea762012-10-25 23:34:41 -05004914 if (ceph_opts)
4915 ceph_destroy_options(ceph_opts);
Alex Elder4e9afeb2012-10-25 23:34:41 -05004916 kfree(rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05004917 rbd_spec_put(spec);
Alex Elderbd4ba652012-10-25 23:34:42 -05004918err_out_module:
4919 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06004920
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004921 dout("Error adding device %s\n", buf);
Alex Elder27cc2592012-02-02 08:13:30 -06004922
4923 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004924}
4925
Alex Elderde71a292012-07-03 16:01:19 -05004926static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004927{
4928 struct list_head *tmp;
4929 struct rbd_device *rbd_dev;
4930
Alex Eldere124a82f2012-01-29 13:57:44 -06004931 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004932 list_for_each(tmp, &rbd_dev_list) {
4933 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05004934 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a82f2012-01-29 13:57:44 -06004935 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004936 return rbd_dev;
Alex Eldere124a82f2012-01-29 13:57:44 -06004937 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004938 }
Alex Eldere124a82f2012-01-29 13:57:44 -06004939 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004940 return NULL;
4941}
4942
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004943static void rbd_dev_release(struct device *dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004944{
Alex Elder593a9e72012-02-07 12:03:37 -06004945 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004946
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07004947 if (rbd_dev->watch_event)
Alex Elder9969ebc2013-01-18 12:31:10 -06004948 rbd_dev_header_watch_sync(rbd_dev, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004949
4950 /* clean up and free blkdev */
4951 rbd_free_disk(rbd_dev);
4952 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder32eec682012-02-08 16:11:14 -06004953
Alex Elder2ac4e752012-07-10 20:30:10 -05004954 /* release allocated disk header fields */
4955 rbd_header_free(&rbd_dev->header);
4956
Alex Elder32eec682012-02-08 16:11:14 -06004957 /* done with the id, and with the rbd_dev */
Alex Eldere2839302012-08-29 17:11:06 -05004958 rbd_dev_id_put(rbd_dev);
Alex Elderc53d5892012-10-25 23:34:42 -05004959 rbd_assert(rbd_dev->rbd_client != NULL);
4960 rbd_dev_destroy(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004961
4962 /* release module ref */
4963 module_put(THIS_MODULE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004964}
4965
Alex Elder2f82ee52012-10-30 19:40:33 -05004966static void __rbd_remove(struct rbd_device *rbd_dev)
4967{
4968 rbd_remove_all_snaps(rbd_dev);
4969 rbd_bus_del_dev(rbd_dev);
4970}
4971
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004972static ssize_t rbd_remove(struct bus_type *bus,
4973 const char *buf,
4974 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004975{
4976 struct rbd_device *rbd_dev = NULL;
4977 int target_id, rc;
4978 unsigned long ul;
4979 int ret = count;
4980
4981 rc = strict_strtoul(buf, 10, &ul);
4982 if (rc)
4983 return rc;
4984
4985 /* convert to int; abort if we lost anything in the conversion */
4986 target_id = (int) ul;
4987 if (target_id != ul)
4988 return -EINVAL;
4989
4990 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4991
4992 rbd_dev = __rbd_get_dev(target_id);
4993 if (!rbd_dev) {
4994 ret = -ENOENT;
4995 goto done;
4996 }
4997
Alex Eldera14ea262013-02-05 13:23:12 -06004998 spin_lock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -06004999 if (rbd_dev->open_count)
Alex Elder42382b72012-11-16 09:29:16 -06005000 ret = -EBUSY;
Alex Elderb82d1672013-01-14 12:43:31 -06005001 else
5002 set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
Alex Eldera14ea262013-02-05 13:23:12 -06005003 spin_unlock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -06005004 if (ret < 0)
Alex Elder42382b72012-11-16 09:29:16 -06005005 goto done;
Alex Elder42382b72012-11-16 09:29:16 -06005006
Alex Elder2f82ee52012-10-30 19:40:33 -05005007 while (rbd_dev->parent_spec) {
5008 struct rbd_device *first = rbd_dev;
5009 struct rbd_device *second = first->parent;
5010 struct rbd_device *third;
5011
5012 /*
5013 * Follow to the parent with no grandparent and
5014 * remove it.
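		 *
		 * For example (hypothetical chain): if the mapped
		 * image has a parent and a grandparent, the
		 * grandparent is removed on the first pass, the
		 * parent on the second, and the mapped image itself
		 * after the loop.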
5015 */
5016 while (second && (third = second->parent)) {
5017 first = second;
5018 second = third;
5019 }
5020 __rbd_remove(second);
5021 rbd_spec_put(first->parent_spec);
5022 first->parent_spec = NULL;
5023 first->parent_overlap = 0;
5024 first->parent = NULL;
5025 }
5026 __rbd_remove(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005027
5028done:
5029 mutex_unlock(&ctl_mutex);
Alex Elderaafb2302012-09-06 16:00:54 -05005030
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005031 return ret;
5032}
5033
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005034/*
5035 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005036 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005037 */
5038static int rbd_sysfs_init(void)
5039{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005040 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005041
Alex Elderfed4c142012-02-07 12:03:36 -06005042 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06005043 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005044 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005045
Alex Elderfed4c142012-02-07 12:03:36 -06005046 ret = bus_register(&rbd_bus_type);
5047 if (ret < 0)
5048 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005049
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005050 return ret;
5051}
5052
5053static void rbd_sysfs_cleanup(void)
5054{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005055 bus_unregister(&rbd_bus_type);
Alex Elderfed4c142012-02-07 12:03:36 -06005056 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005057}
5058
Alex Eldercc344fa2013-02-19 12:25:56 -06005059static int __init rbd_init(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005060{
5061 int rc;
5062
Alex Elder1e32d342013-01-30 11:13:33 -06005063 if (!libceph_compatible(NULL)) {
5064 rbd_warn(NULL, "libceph incompatibility (quitting)");
5065
5066 return -EINVAL;
5067 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005068 rc = rbd_sysfs_init();
5069 if (rc)
5070 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06005071 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005072 return 0;
5073}
5074
Alex Eldercc344fa2013-02-19 12:25:56 -06005075static void __exit rbd_exit(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005076{
5077 rbd_sysfs_cleanup();
5078}
5079
5080module_init(rbd_init);
5081module_exit(rbd_exit);
5082
5083MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
5084MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
5085MODULE_DESCRIPTION("rados block device");
5086
5087/* following authorship retained from original osdblk.c */
5088MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
5089
5090MODULE_LICENSE("GPL");