blob: b7b7a88d9f689cdd3697ac8074de1887e7d50c15 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elderaafb230e2012-09-06 16:00:54 -050044#define RBD_DEBUG /* Activate rbd_assert() calls */
45
Alex Elder593a9e72012-02-07 12:03:37 -060046/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
Alex Elder2647ba32012-11-19 22:55:21 -060055/* It might be useful to have these defined elsewhere */
Alex Elderdf111be2012-08-09 10:33:26 -070056
Alex Elder2647ba32012-11-19 22:55:21 -060057#define U8_MAX ((u8) (~0U))
58#define U16_MAX ((u16) (~0U))
59#define U32_MAX ((u32) (~0U))
60#define U64_MAX ((u64) (~0ULL))
Alex Elderdf111be2012-08-09 10:33:26 -070061
Alex Elderf0f8cef2012-01-29 13:57:44 -060062#define RBD_DRV_NAME "rbd"
63#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070064
65#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
66
Alex Elderd4b125e2012-07-03 16:01:19 -050067#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
68#define RBD_MAX_SNAP_NAME_LEN \
69 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
70
Alex Elder35d489f2012-07-03 16:01:19 -050071#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070072
73#define RBD_SNAP_HEAD_NAME "-"
74
Alex Elder9e15b772012-10-30 19:40:33 -050075/* This allows a single page to hold an image name sent by OSD */
76#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1)
Alex Elder1e130192012-07-03 16:01:19 -050077#define RBD_IMAGE_ID_LEN_MAX 64
Alex Elder9e15b772012-10-30 19:40:33 -050078
Alex Elder1e130192012-07-03 16:01:19 -050079#define RBD_OBJ_PREFIX_LEN_MAX 64
Alex Elder589d30e2012-07-10 20:30:11 -050080
Alex Elderd8891402012-10-09 13:50:17 -070081/* Feature bits */
82
83#define RBD_FEATURE_LAYERING 1
84
85/* Features supported by this (client software) implementation. */
86
87#define RBD_FEATURES_ALL (0)
88
Alex Elder81a89792012-02-02 08:13:30 -060089/*
90 * An RBD device name will be "rbd#", where the "rbd" comes from
91 * RBD_DRV_NAME above, and # is a unique integer identifier.
92 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
93 * enough to hold all possible device names.
94 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070095#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060096#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070097
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These four fields never change for a given rbd image */
	char *object_prefix;	/* prefix for names of backing data objects */
	u64 features;		/* RBD_FEATURE_* bit mask */
	__u8 obj_order;		/* object size is (1 << obj_order) bytes */
	__u8 crypt_type;	/* on-disk encryption type (format 1) */
	__u8 comp_type;		/* on-disk compression type (format 1) */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* current image size, in bytes */
	struct ceph_snap_context *snapc;	/* snapshot context */
	char *snap_names;	/* consecutive NUL-terminated snapshot names */
	u64 *snap_sizes;	/* per-snapshot image size, parallel to snapc */

	u64 obj_version;	/* version of the header object last read */
};
117
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500118/*
119 * An rbd image specification.
120 *
121 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
Alex Elderc66c6e02012-11-01 08:39:26 -0500122 * identify an image. Each rbd_dev structure includes a pointer to
123 * an rbd_spec structure that encapsulates this identity.
124 *
125 * Each of the id's in an rbd_spec has an associated name. For a
126 * user-mapped image, the names are supplied and the id's associated
127 * with them are looked up. For a layered image, a parent image is
128 * defined by the tuple, and the names are looked up.
129 *
130 * An rbd_dev structure contains a parent_spec pointer which is
131 * non-null if the image it represents is a child in a layered
132 * image. This pointer will refer to the rbd_spec structure used
133 * by the parent rbd_dev for its own identity (i.e., the structure
134 * is shared between the parent and child).
135 *
136 * Since these structures are populated once, during the discovery
137 * phase of image construction, they are effectively immutable so
138 * we make no effort to synchronize access to them.
139 *
140 * Note that code herein does not assume the image name is known (it
141 * could be a null pointer).
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500142 */
143struct rbd_spec {
144 u64 pool_id;
145 char *pool_name;
146
147 char *image_id;
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500148 char *image_name;
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500149
150 u64 snap_id;
151 char *snap_name;
152
153 struct kref kref;
154};
155
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client *client;	/* underlying ceph client */
	struct kref kref;		/* reference count */
	struct list_head node;		/* entry on global rbd_client_list */
};
164
struct rbd_img_request;
/* Completion callback invoked when an image request finishes */
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

struct rbd_obj_request;
/* Completion callback invoked when an object request finishes */
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

/* How the data for an object request is supplied */
enum obj_request_type {
	OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};
Alex Elderbf0d5f502012-11-22 00:00:08 -0600176
/*
 * A request targeting a single backing object (or a byte range of
 * one).  One or more of these make up an rbd_img_request.
 */
struct rbd_obj_request {
	const char *object_name;	/* name of the backing rados object */
	u64 offset;		/* object start byte */
	u64 length;		/* bytes from offset */

	struct rbd_img_request *img_request;	/* enclosing image request */
	struct list_head links;	/* img_request->obj_requests */
	u32 which;		/* posn image request list */

	enum obj_request_type type;	/* selects union member below */
	union {
		struct bio *bio_list;		/* OBJ_REQUEST_BIO */
		struct {			/* OBJ_REQUEST_PAGES */
			struct page **pages;
			u32 page_count;
		};
	};

	struct ceph_osd_request *osd_req;	/* underlying osd request */

	u64 xferred;		/* bytes transferred */
	u64 version;		/* object version from the osd reply */
	int result;		/* osd result code */
	atomic_t done;		/* nonzero once the request has completed */

	rbd_obj_callback_t callback;	/* optional completion callback */
	struct completion completion;	/* for synchronous waiters */

	struct kref kref;	/* reference count */
};
207
/*
 * A request spanning a byte range of an image, made up of one or
 * more object requests (one per backing object touched).
 */
struct rbd_img_request {
	struct request *rq;		/* originating block-layer request */
	struct rbd_device *rbd_dev;	/* device the request is for */
	u64 offset;	/* starting image byte offset */
	u64 length;	/* byte count from offset */
	bool write_request;	/* false for read */
	union {
		struct ceph_snap_context *snapc;	/* for writes */
		u64 snap_id;		/* for reads */
	};
	spinlock_t completion_lock;/* protects next_completion */
	u32 next_completion;	/* index of first incomplete obj request */
	rbd_img_callback_t callback;	/* run when all objects complete */

	u32 obj_request_count;	/* number of object requests on the list */
	struct list_head obj_requests;	/* rbd_obj_request structs */

	struct kref kref;	/* reference count */
};
227
/* Iterate over an image request's object requests (optionally from a
 * given position, or safely while removing entries in reverse) */
#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
Alex Elderbf0d5f502012-11-22 00:00:08 -0600234
/* In-memory record of one snapshot of an image */
struct rbd_snap {
	struct device dev;	/* sysfs device representing the snapshot */
	const char *name;	/* snapshot name */
	u64 size;		/* image size at snapshot time */
	struct list_head node;	/* entry on rbd_dev->snaps */
	u64 id;			/* snapshot id */
	u64 features;		/* feature mask at snapshot time */
};
243
/* Attributes of the currently mapped image or snapshot */
struct rbd_mapping {
	u64 size;		/* size of the mapping, in bytes */
	u64 features;		/* feature mask of what is mapped */
	bool read_only;		/* true if writes must be refused */
};
249
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700250/*
251 * a single device
252 */
253struct rbd_device {
Alex Elderde71a292012-07-03 16:01:19 -0500254 int dev_id; /* blkdev unique id */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700255
256 int major; /* blkdev assigned major */
257 struct gendisk *disk; /* blkdev's gendisk and rq */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700258
Alex Eldera30b71b2012-07-10 20:30:11 -0500259 u32 image_format; /* Either 1 or 2 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700260 struct rbd_client *rbd_client;
261
262 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
263
Alex Elderb82d1672013-01-14 12:43:31 -0600264 spinlock_t lock; /* queue, flags, open_count */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700265
266 struct rbd_image_header header;
Alex Elderb82d1672013-01-14 12:43:31 -0600267 unsigned long flags; /* possibly lock protected */
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500268 struct rbd_spec *spec;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700269
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500270 char *header_name;
Alex Elder971f8392012-10-25 23:34:41 -0500271
Alex Elder0903e872012-11-14 12:25:19 -0600272 struct ceph_file_layout layout;
273
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700274 struct ceph_osd_event *watch_event;
Alex Elder975241a2013-01-25 17:08:55 -0600275 struct rbd_obj_request *watch_request;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700276
Alex Elder86b00e02012-10-25 23:34:42 -0500277 struct rbd_spec *parent_spec;
278 u64 parent_overlap;
279
Josh Durginc6666012011-11-21 17:11:12 -0800280 /* protects updating the header */
281 struct rw_semaphore header_rwsem;
Alex Elderf84344f2012-08-31 17:29:51 -0500282
283 struct rbd_mapping mapping;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700284
285 struct list_head node;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800286
287 /* list of snapshots */
288 struct list_head snaps;
289
290 /* sysfs related */
291 struct device dev;
Alex Elderb82d1672013-01-14 12:43:31 -0600292 unsigned long open_count; /* protected by lock */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800293};
294
/*
 * Flag bits for rbd_dev->flags.  If atomicity is required,
 * rbd_dev->lock is used to protect access.
 *
 * Currently, only the "removing" flag (which is coupled with the
 * "open_count" field) requires atomic access.
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
};
306
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700307static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a822012-01-29 13:57:44 -0600308
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700309static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a822012-01-29 13:57:44 -0600310static DEFINE_SPINLOCK(rbd_dev_list_lock);
311
Alex Elder432b8582012-01-29 13:57:44 -0600312static LIST_HEAD(rbd_client_list); /* clients */
313static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700314
Alex Elder304f6802012-08-31 17:29:52 -0500315static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
316static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
317
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800318static void rbd_dev_release(struct device *dev);
Alex Elder41f38c22012-10-25 23:34:40 -0500319static void rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800320
Alex Elderf0f8cef2012-01-29 13:57:44 -0600321static ssize_t rbd_add(struct bus_type *bus, const char *buf,
322 size_t count);
323static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
324 size_t count);
325
/* Bus attributes: writing to "add"/"remove" maps or unmaps a device */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

/* Bus under which rbd devices appear (/sys/bus/rbd) */
static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};
336
/* No-op release; rbd_root_dev is static, so there is nothing to free */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Parent device of all rbd devices in sysfs */
static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};
345
Alex Elder06ecc6c2012-11-01 10:17:15 -0500346static __printf(2, 3)
347void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
348{
349 struct va_format vaf;
350 va_list args;
351
352 va_start(args, fmt);
353 vaf.fmt = fmt;
354 vaf.va = &args;
355
356 if (!rbd_dev)
357 printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
358 else if (rbd_dev->disk)
359 printk(KERN_WARNING "%s: %s: %pV\n",
360 RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
361 else if (rbd_dev->spec && rbd_dev->spec->image_name)
362 printk(KERN_WARNING "%s: image %s: %pV\n",
363 RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
364 else if (rbd_dev->spec && rbd_dev->spec->image_id)
365 printk(KERN_WARNING "%s: id %s: %pV\n",
366 RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
367 else /* punt */
368 printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
369 RBD_DRV_NAME, rbd_dev, &vaf);
370 va_end(args);
371}
372
#ifdef RBD_DEBUG
/*
 * Check an invariant: log the failed expression with its location,
 * then BUG().  Compiles to nothing unless RBD_DEBUG is defined.
 */
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800385
Alex Elder117973f2012-08-31 17:29:55 -0500386static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
387static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700388
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700389static int rbd_open(struct block_device *bdev, fmode_t mode)
390{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600391 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Alex Elderb82d1672013-01-14 12:43:31 -0600392 bool removing = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700393
Alex Elderf84344f2012-08-31 17:29:51 -0500394 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700395 return -EROFS;
396
Alex Eldera14ea262013-02-05 13:23:12 -0600397 spin_lock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -0600398 if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
399 removing = true;
400 else
401 rbd_dev->open_count++;
Alex Eldera14ea262013-02-05 13:23:12 -0600402 spin_unlock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -0600403 if (removing)
404 return -ENOENT;
405
Alex Elder42382b72012-11-16 09:29:16 -0600406 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elderc3e946c2012-11-16 09:29:16 -0600407 (void) get_device(&rbd_dev->dev);
Alex Elderf84344f2012-08-31 17:29:51 -0500408 set_device_ro(bdev, rbd_dev->mapping.read_only);
Alex Elder42382b72012-11-16 09:29:16 -0600409 mutex_unlock(&ctl_mutex);
Alex Elder340c7a22012-08-10 13:12:07 -0700410
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700411 return 0;
412}
413
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800414static int rbd_release(struct gendisk *disk, fmode_t mode)
415{
416 struct rbd_device *rbd_dev = disk->private_data;
Alex Elderb82d1672013-01-14 12:43:31 -0600417 unsigned long open_count_before;
418
Alex Eldera14ea262013-02-05 13:23:12 -0600419 spin_lock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -0600420 open_count_before = rbd_dev->open_count--;
Alex Eldera14ea262013-02-05 13:23:12 -0600421 spin_unlock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -0600422 rbd_assert(open_count_before > 0);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800423
Alex Elder42382b72012-11-16 09:29:16 -0600424 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elderc3e946c2012-11-16 09:29:16 -0600425 put_device(&rbd_dev->dev);
Alex Elder42382b72012-11-16 09:29:16 -0600426 mutex_unlock(&ctl_mutex);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800427
428 return 0;
429}
430
/* Block-device operations implemented by this driver */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
436
437/*
438 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500439 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700440 */
Alex Elderf8c38922012-08-10 13:12:07 -0700441static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700442{
443 struct rbd_client *rbdc;
444 int ret = -ENOMEM;
445
Alex Elder37206ee2013-02-20 17:32:08 -0600446 dout("%s:\n", __func__);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700447 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
448 if (!rbdc)
449 goto out_opt;
450
451 kref_init(&rbdc->kref);
452 INIT_LIST_HEAD(&rbdc->node);
453
Alex Elderbc534d82012-01-29 13:57:44 -0600454 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
455
Alex Elder43ae4702012-07-03 16:01:18 -0500456 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700457 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600458 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500459 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700460
461 ret = ceph_open_session(rbdc->client);
462 if (ret < 0)
463 goto out_err;
464
Alex Elder432b8582012-01-29 13:57:44 -0600465 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700466 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600467 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700468
Alex Elderbc534d82012-01-29 13:57:44 -0600469 mutex_unlock(&ctl_mutex);
Alex Elder37206ee2013-02-20 17:32:08 -0600470 dout("%s: rbdc %p\n", __func__, rbdc);
Alex Elderbc534d82012-01-29 13:57:44 -0600471
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700472 return rbdc;
473
474out_err:
475 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600476out_mutex:
477 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700478 kfree(rbdc);
479out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500480 if (ceph_opts)
481 ceph_destroy_options(ceph_opts);
Alex Elder37206ee2013-02-20 17:32:08 -0600482 dout("%s: error %d\n", __func__, ret);
483
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400484 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700485}
486
487/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700488 * Find a ceph client with specific addr and configuration. If
489 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700490 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700491static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700492{
493 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700494 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700495
Alex Elder43ae4702012-07-03 16:01:18 -0500496 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700497 return NULL;
498
Alex Elder1f7ba332012-08-10 13:12:07 -0700499 spin_lock(&rbd_client_list_lock);
500 list_for_each_entry(client_node, &rbd_client_list, node) {
501 if (!ceph_compare_options(ceph_opts, client_node->client)) {
502 kref_get(&client_node->kref);
503 found = true;
504 break;
505 }
506 }
507 spin_unlock(&rbd_client_list_lock);
508
509 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700510}
511
/*
 * mount options
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

/* Maps option strings to the token values above */
static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};

/* Options parsed from a map ("add") request */
struct rbd_options {
	bool	read_only;	/* map the image read-only */
};

#define RBD_READ_ONLY_DEFAULT	false
542
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700543static int parse_rbd_opts_token(char *c, void *private)
544{
Alex Elder43ae4702012-07-03 16:01:18 -0500545 struct rbd_options *rbd_opts = private;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700546 substring_t argstr[MAX_OPT_ARGS];
547 int token, intval, ret;
548
Alex Elder43ae4702012-07-03 16:01:18 -0500549 token = match_token(c, rbd_opts_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700550 if (token < 0)
551 return -EINVAL;
552
553 if (token < Opt_last_int) {
554 ret = match_int(&argstr[0], &intval);
555 if (ret < 0) {
556 pr_err("bad mount option arg (not int) "
557 "at '%s'\n", c);
558 return ret;
559 }
560 dout("got int token %d val %d\n", token, intval);
561 } else if (token > Opt_last_int && token < Opt_last_string) {
562 dout("got string token %d val %s\n", token,
563 argstr[0].from);
Alex Eldercc0538b2012-08-10 13:12:07 -0700564 } else if (token > Opt_last_string && token < Opt_last_bool) {
565 dout("got Boolean token %d\n", token);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700566 } else {
567 dout("got token %d\n", token);
568 }
569
570 switch (token) {
Alex Eldercc0538b2012-08-10 13:12:07 -0700571 case Opt_read_only:
572 rbd_opts->read_only = true;
573 break;
574 case Opt_read_write:
575 rbd_opts->read_only = false;
576 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700577 default:
Alex Elderaafb230e2012-09-06 16:00:54 -0500578 rbd_assert(false);
579 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700580 }
581 return 0;
582}
583
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Consumes ceph_opts either way.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (rbdc) {
		/* Reusing an existing client; the options are not needed */
		ceph_destroy_options(ceph_opts);
		return rbdc;
	}

	return rbd_client_create(ceph_opts);
}
600
601/*
602 * Destroy ceph client
Alex Elderd23a4b32012-01-29 13:57:43 -0600603 *
Alex Elder432b8582012-01-29 13:57:44 -0600604 * Caller must hold rbd_client_list_lock.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700605 */
606static void rbd_client_release(struct kref *kref)
607{
608 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
609
Alex Elder37206ee2013-02-20 17:32:08 -0600610 dout("%s: rbdc %p\n", __func__, rbdc);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500611 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700612 list_del(&rbdc->node);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500613 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700614
615 ceph_destroy_client(rbdc->client);
616 kfree(rbdc);
617}
618
619/*
620 * Drop reference to ceph client node. If it's not referenced anymore, release
621 * it.
622 */
Alex Elder9d3997f2012-10-25 23:34:42 -0500623static void rbd_put_client(struct rbd_client *rbdc)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700624{
Alex Elderc53d5892012-10-25 23:34:42 -0500625 if (rbdc)
626 kref_put(&rbdc->kref, rbd_client_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700627}
628
Alex Eldera30b71b2012-07-10 20:30:11 -0500629static bool rbd_image_format_valid(u32 image_format)
630{
631 return image_format == 1 || image_format == 2;
632}
633
Alex Elder8e94af82012-07-25 09:32:40 -0500634static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
635{
Alex Elder103a1502012-08-02 11:29:45 -0500636 size_t size;
637 u32 snap_count;
638
639 /* The header has to start with the magic rbd header text */
640 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
641 return false;
642
Alex Elderdb2388b2012-10-20 22:17:27 -0500643 /* The bio layer requires at least sector-sized I/O */
644
645 if (ondisk->options.order < SECTOR_SHIFT)
646 return false;
647
648 /* If we use u64 in a few spots we may be able to loosen this */
649
650 if (ondisk->options.order > 8 * sizeof (int) - 1)
651 return false;
652
Alex Elder103a1502012-08-02 11:29:45 -0500653 /*
654 * The size of a snapshot header has to fit in a size_t, and
655 * that limits the number of snapshots.
656 */
657 snap_count = le32_to_cpu(ondisk->snap_count);
658 size = SIZE_MAX - sizeof (struct ceph_snap_context);
659 if (snap_count > size / sizeof (__le64))
660 return false;
661
662 /*
663 * Not only that, but the size of the entire the snapshot
664 * header must also be representable in a size_t.
665 */
666 size -= snap_count * sizeof (__le64);
667 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
668 return false;
669
670 return true;
Alex Elder8e94af82012-07-25 09:32:40 -0500671}
672
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700673/*
674 * Create a new header structure, translate header format from the on-disk
675 * header.
676 */
677static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500678 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700679{
Alex Elderccece232012-07-10 20:30:10 -0500680 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500681 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500682 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500683 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700684
Alex Elder6a523252012-07-19 17:12:59 -0500685 memset(header, 0, sizeof (*header));
686
Alex Elder103a1502012-08-02 11:29:45 -0500687 snap_count = le32_to_cpu(ondisk->snap_count);
688
Alex Elder58c17b02012-08-23 23:22:06 -0500689 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
690 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500691 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700692 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500693 memcpy(header->object_prefix, ondisk->object_prefix, len);
694 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600695
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700696 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500697 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
698
Alex Elder621901d2012-08-23 23:22:06 -0500699 /* Save a copy of the snapshot names */
700
Alex Elderf785cc12012-08-23 23:22:06 -0500701 if (snap_names_len > (u64) SIZE_MAX)
702 return -EIO;
703 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700704 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500705 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500706 /*
707 * Note that rbd_dev_v1_header_read() guarantees
708 * the ondisk buffer we're working with has
709 * snap_names_len bytes beyond the end of the
710 * snapshot id array, this memcpy() is safe.
711 */
712 memcpy(header->snap_names, &ondisk->snaps[snap_count],
713 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500714
Alex Elder621901d2012-08-23 23:22:06 -0500715 /* Record each snapshot's size */
716
Alex Elderd2bb24e2012-07-26 23:37:14 -0500717 size = snap_count * sizeof (*header->snap_sizes);
718 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700719 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500720 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500721 for (i = 0; i < snap_count; i++)
722 header->snap_sizes[i] =
723 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700724 } else {
Alex Elderccece232012-07-10 20:30:10 -0500725 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700726 header->snap_names = NULL;
727 header->snap_sizes = NULL;
728 }
Alex Elder849b4262012-07-09 21:04:24 -0500729
Alex Elder34b13182012-07-13 20:35:12 -0500730 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700731 header->obj_order = ondisk->options.order;
732 header->crypt_type = ondisk->options.crypt_type;
733 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500734
Alex Elder621901d2012-08-23 23:22:06 -0500735 /* Allocate and fill in the snapshot context */
736
Alex Elderf84344f2012-08-31 17:29:51 -0500737 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500738 size = sizeof (struct ceph_snap_context);
739 size += snap_count * sizeof (header->snapc->snaps[0]);
740 header->snapc = kzalloc(size, GFP_KERNEL);
741 if (!header->snapc)
742 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700743
744 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500745 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700746 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500747 for (i = 0; i < snap_count; i++)
748 header->snapc->snaps[i] =
749 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700750
751 return 0;
752
Alex Elder6a523252012-07-19 17:12:59 -0500753out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500754 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500755 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700756 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500757 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500758 kfree(header->object_prefix);
759 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500760
Alex Elder00f1f362012-02-07 12:03:36 -0600761 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700762}
763
Alex Elder9e15b772012-10-30 19:40:33 -0500764static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
765{
766 struct rbd_snap *snap;
767
768 if (snap_id == CEPH_NOSNAP)
769 return RBD_SNAP_HEAD_NAME;
770
771 list_for_each_entry(snap, &rbd_dev->snaps, node)
772 if (snap_id == snap->id)
773 return snap->name;
774
775 return NULL;
776}
777
Alex Elder8836b992012-08-30 14:42:15 -0500778static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700779{
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700780
Alex Eldere86924a2012-07-10 20:30:11 -0500781 struct rbd_snap *snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600782
Alex Eldere86924a2012-07-10 20:30:11 -0500783 list_for_each_entry(snap, &rbd_dev->snaps, node) {
784 if (!strcmp(snap_name, snap->name)) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500785 rbd_dev->spec->snap_id = snap->id;
Alex Eldere86924a2012-07-10 20:30:11 -0500786 rbd_dev->mapping.size = snap->size;
Alex Elder34b13182012-07-13 20:35:12 -0500787 rbd_dev->mapping.features = snap->features;
Alex Elder00f1f362012-02-07 12:03:36 -0600788
Alex Eldere86924a2012-07-10 20:30:11 -0500789 return 0;
Alex Elder00f1f362012-02-07 12:03:36 -0600790 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700791 }
Alex Eldere86924a2012-07-10 20:30:11 -0500792
Alex Elder00f1f362012-02-07 12:03:36 -0600793 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700794}
795
Alex Elder819d52b2012-10-25 23:34:41 -0500796static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700797{
Alex Elder78dc4472012-07-19 08:49:18 -0500798 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700799
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500800 if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
Josh Durgincc9d7342011-11-21 18:19:13 -0800801 sizeof (RBD_SNAP_HEAD_NAME))) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500802 rbd_dev->spec->snap_id = CEPH_NOSNAP;
Alex Elder99c1f082012-08-30 14:42:15 -0500803 rbd_dev->mapping.size = rbd_dev->header.image_size;
Alex Elder34b13182012-07-13 20:35:12 -0500804 rbd_dev->mapping.features = rbd_dev->header.features;
Alex Eldere86924a2012-07-10 20:30:11 -0500805 ret = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700806 } else {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500807 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700808 if (ret < 0)
809 goto done;
Alex Elderf84344f2012-08-31 17:29:51 -0500810 rbd_dev->mapping.read_only = true;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700811 }
Alex Elder6d292902013-01-14 12:43:31 -0600812 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
813
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700814done:
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700815 return ret;
816}
817
818static void rbd_header_free(struct rbd_image_header *header)
819{
Alex Elder849b4262012-07-09 21:04:24 -0500820 kfree(header->object_prefix);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500821 header->object_prefix = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700822 kfree(header->snap_sizes);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500823 header->snap_sizes = NULL;
Alex Elder849b4262012-07-09 21:04:24 -0500824 kfree(header->snap_names);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500825 header->snap_names = NULL;
Josh Durgind1d25642011-12-05 14:03:05 -0800826 ceph_put_snap_context(header->snapc);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500827 header->snapc = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700828}
829
Alex Elder98571b52013-01-20 14:44:42 -0600830static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700831{
Alex Elder65ccfe22012-08-09 10:33:26 -0700832 char *name;
833 u64 segment;
834 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700835
Alex Elder2fd82b92012-11-09 15:05:54 -0600836 name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
Alex Elder65ccfe22012-08-09 10:33:26 -0700837 if (!name)
838 return NULL;
839 segment = offset >> rbd_dev->header.obj_order;
Alex Elder2fd82b92012-11-09 15:05:54 -0600840 ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
Alex Elder65ccfe22012-08-09 10:33:26 -0700841 rbd_dev->header.object_prefix, segment);
Alex Elder2fd82b92012-11-09 15:05:54 -0600842 if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
Alex Elder65ccfe22012-08-09 10:33:26 -0700843 pr_err("error formatting segment name for #%llu (%d)\n",
844 segment, ret);
845 kfree(name);
846 name = NULL;
847 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700848
Alex Elder65ccfe22012-08-09 10:33:26 -0700849 return name;
850}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700851
Alex Elder65ccfe22012-08-09 10:33:26 -0700852static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
853{
854 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700855
Alex Elder65ccfe22012-08-09 10:33:26 -0700856 return offset & (segment_size - 1);
857}
858
859static u64 rbd_segment_length(struct rbd_device *rbd_dev,
860 u64 offset, u64 length)
861{
862 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
863
864 offset &= segment_size - 1;
865
Alex Elderaafb230e2012-09-06 16:00:54 -0500866 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700867 if (offset + length > segment_size)
868 length = segment_size - offset;
869
870 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700871}
872
873/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700874 * returns the size of an object in the image
875 */
876static u64 rbd_obj_bytes(struct rbd_image_header *header)
877{
878 return 1 << header->obj_order;
879}
880
881/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700882 * bio helpers
883 */
884
885static void bio_chain_put(struct bio *chain)
886{
887 struct bio *tmp;
888
889 while (chain) {
890 tmp = chain;
891 chain = chain->bi_next;
892 bio_put(tmp);
893 }
894}
895
/*
 * zeros a bio chain, starting at specific offset
 *
 * Every byte at or beyond start_ofs (counted from the start of the
 * chain's data) is cleared; bytes before it are left alone.  Pages
 * are mapped one bio_vec at a time with bvec_kmap_irq() since they
 * may be highmem pages.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* running byte position within the chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* Zero from start_ofs if it falls in this
				 * segment, else from the segment start. */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
922
/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 *
 * The clone shares the source's pages (BIO_CLONED); only the
 * bio_vec entries covering [offset, offset + len) are copied, with
 * the first and last entries trimmed to the requested range.
 * Returns NULL on bad arguments or allocation failure.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;
	unsigned short end_idx;
	unsigned short vcnt;
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;	/* byte offset of the range within segment idx */

	/* ...and the last affected segment */

	resid += len;	/* now: bytes of the range within segment end_idx */
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	vcnt = end_idx - idx + 1;	/* segments in the clone */

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		/* Single segment: both ends trimmed at once */
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001003
/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;	/* cursor: current source bio */
	unsigned int off = *offset;	/* cursor: offset within it */
	struct bio *chain = NULL;	/* head of the cloned chain */
	struct bio **end;		/* tail link to append clones to */

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi) {
			/* Caller asked for more data than the chain holds */
			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
			goto out_err;	/* EINVAL; ran out of bio's */
		}
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;		/* append clone to the result */
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {	/* source bio consumed */
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	*bio_src = bi;		/* report where un-cloned data begins */
	*offset = off;

	return chain;
out_err:
	bio_chain_put(chain);

	return NULL;
}
1066
Alex Elderbf0d5f502012-11-22 00:00:08 -06001067static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1068{
Alex Elder37206ee2013-02-20 17:32:08 -06001069 dout("%s: obj %p (was %d)\n", __func__, obj_request,
1070 atomic_read(&obj_request->kref.refcount));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001071 kref_get(&obj_request->kref);
1072}
1073
1074static void rbd_obj_request_destroy(struct kref *kref);
1075static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1076{
1077 rbd_assert(obj_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001078 dout("%s: obj %p (was %d)\n", __func__, obj_request,
1079 atomic_read(&obj_request->kref.refcount));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001080 kref_put(&obj_request->kref, rbd_obj_request_destroy);
1081}
1082
1083static void rbd_img_request_get(struct rbd_img_request *img_request)
1084{
Alex Elder37206ee2013-02-20 17:32:08 -06001085 dout("%s: img %p (was %d)\n", __func__, img_request,
1086 atomic_read(&img_request->kref.refcount));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001087 kref_get(&img_request->kref);
1088}
1089
1090static void rbd_img_request_destroy(struct kref *kref);
1091static void rbd_img_request_put(struct rbd_img_request *img_request)
1092{
1093 rbd_assert(img_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001094 dout("%s: img %p (was %d)\n", __func__, img_request,
1095 atomic_read(&img_request->kref.refcount));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001096 kref_put(&img_request->kref, rbd_img_request_destroy);
1097}
1098
/*
 * Append an object request to an image request's list.  The object
 * request takes its position ("which") from the image request's
 * current count, and the image request takes a reference to the
 * object request.
 */
static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	/* An object request belongs to at most one image request */
	rbd_assert(obj_request->img_request == NULL);

	rbd_obj_request_get(obj_request);
	obj_request->img_request = img_request;
	obj_request->which = img_request->obj_request_count;
	rbd_assert(obj_request->which != BAD_WHICH);
	img_request->obj_request_count++;
	list_add_tail(&obj_request->links, &img_request->obj_requests);
	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
}
1113
/*
 * Remove an object request from its image request's list and drop
 * the reference the image request held on it.  The assertion on
 * "which" implies requests are removed in LIFO order: only the
 * request added last (count - 1) may be deleted.
 */
static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->which != BAD_WHICH);

	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
	list_del(&obj_request->links);
	rbd_assert(img_request->obj_request_count > 0);
	img_request->obj_request_count--;
	rbd_assert(obj_request->which == img_request->obj_request_count);
	obj_request->which = BAD_WHICH;
	rbd_assert(obj_request->img_request == img_request);
	obj_request->img_request = NULL;
	obj_request->callback = NULL;
	rbd_obj_request_put(obj_request);
}
1131
1132static bool obj_request_type_valid(enum obj_request_type type)
1133{
1134 switch (type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001135 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001136 case OBJ_REQUEST_BIO:
Alex Elder788e2df2013-01-17 12:25:27 -06001137 case OBJ_REQUEST_PAGES:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001138 return true;
1139 default:
1140 return false;
1141 }
1142}
1143
Alex Eldercc344fa2013-02-19 12:25:56 -06001144static struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
Alex Elder8d23bf22012-11-19 22:55:21 -06001145{
1146 struct ceph_osd_req_op *op;
1147 va_list args;
Alex Elder2647ba32012-11-19 22:55:21 -06001148 size_t size;
Alex Elder8d23bf22012-11-19 22:55:21 -06001149
1150 op = kzalloc(sizeof (*op), GFP_NOIO);
1151 if (!op)
1152 return NULL;
1153 op->op = opcode;
1154 va_start(args, opcode);
1155 switch (opcode) {
1156 case CEPH_OSD_OP_READ:
1157 case CEPH_OSD_OP_WRITE:
1158 /* rbd_osd_req_op_create(READ, offset, length) */
1159 /* rbd_osd_req_op_create(WRITE, offset, length) */
1160 op->extent.offset = va_arg(args, u64);
1161 op->extent.length = va_arg(args, u64);
1162 if (opcode == CEPH_OSD_OP_WRITE)
1163 op->payload_len = op->extent.length;
1164 break;
Alex Elderfbfab532013-02-08 09:55:48 -06001165 case CEPH_OSD_OP_STAT:
1166 break;
Alex Elder2647ba32012-11-19 22:55:21 -06001167 case CEPH_OSD_OP_CALL:
1168 /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
1169 op->cls.class_name = va_arg(args, char *);
1170 size = strlen(op->cls.class_name);
1171 rbd_assert(size <= (size_t) U8_MAX);
1172 op->cls.class_len = size;
1173 op->payload_len = size;
1174
1175 op->cls.method_name = va_arg(args, char *);
1176 size = strlen(op->cls.method_name);
1177 rbd_assert(size <= (size_t) U8_MAX);
1178 op->cls.method_len = size;
1179 op->payload_len += size;
1180
1181 op->cls.argc = 0;
1182 op->cls.indata = va_arg(args, void *);
1183 size = va_arg(args, size_t);
1184 rbd_assert(size <= (size_t) U32_MAX);
1185 op->cls.indata_len = (u32) size;
1186 op->payload_len += size;
1187 break;
Alex Elder5efea492012-11-19 22:55:21 -06001188 case CEPH_OSD_OP_NOTIFY_ACK:
1189 case CEPH_OSD_OP_WATCH:
1190 /* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
1191 /* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
1192 op->watch.cookie = va_arg(args, u64);
1193 op->watch.ver = va_arg(args, u64);
1194 op->watch.ver = cpu_to_le64(op->watch.ver);
1195 if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
1196 op->watch.flag = (u8) 1;
1197 break;
Alex Elder8d23bf22012-11-19 22:55:21 -06001198 default:
1199 rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
1200 kfree(op);
1201 op = NULL;
1202 break;
1203 }
1204 va_end(args);
1205
1206 return op;
1207}
1208
/* Free an op allocated by rbd_osd_req_op_create() (NULL is allowed). */
static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
{
	kfree(op);
}
1213
Alex Elderbf0d5f502012-11-22 00:00:08 -06001214static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1215 struct rbd_obj_request *obj_request)
1216{
Alex Elder37206ee2013-02-20 17:32:08 -06001217 dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
1218
Alex Elderbf0d5f502012-11-22 00:00:08 -06001219 return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1220}
1221
1222static void rbd_img_request_complete(struct rbd_img_request *img_request)
1223{
Alex Elder37206ee2013-02-20 17:32:08 -06001224 dout("%s: img %p\n", __func__, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001225 if (img_request->callback)
1226 img_request->callback(img_request);
1227 else
1228 rbd_img_request_put(img_request);
1229}
1230
/* Caller is responsible for rbd_obj_request_destroy(obj_request) */

/*
 * Block (interruptibly) until the object request's completion is
 * signalled.  Returns 0 once complete, or -ERESTARTSYS if the wait
 * was interrupted by a signal.
 */
static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	return wait_for_completion_interruptible(&obj_request->completion);
}
1239
/*
 * Reset an object request's "done" indicator.  The write barrier
 * orders the reset against later accesses from other CPUs; see
 * obj_request_done_test().
 */
static void obj_request_done_init(struct rbd_obj_request *obj_request)
{
	atomic_set(&obj_request->done, 0);
	smp_wmb();
}
1245
/*
 * Mark an object request done.  An increment is used rather than a
 * plain store so that completing the same request twice (a bug) can
 * be detected and reported rather than silently overwritten.
 */
static void obj_request_done_set(struct rbd_obj_request *obj_request)
{
	int done;

	done = atomic_inc_return(&obj_request->done);
	if (done > 1) {
		struct rbd_img_request *img_request = obj_request->img_request;
		struct rbd_device *rbd_dev;

		/* img_request may be absent for standalone requests */
		rbd_dev = img_request ? img_request->rbd_dev : NULL;
		rbd_warn(rbd_dev, "obj_request %p was already done\n",
			obj_request);
	}
}
1260
/*
 * Return true if the object request has been marked done.  The full
 * barrier orders this read against prior updates of the flag made
 * on other CPUs (see obj_request_done_init()/obj_request_done_set()).
 */
static bool obj_request_done_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return atomic_read(&obj_request->done) != 0;
}
1266
/*
 * Fix up the result of a completed read issued on behalf of an
 * image request, then mark the object request done.
 */
static void
rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
		obj_request, obj_request->img_request, obj_request->result,
		obj_request->xferred, obj_request->length);
	/*
	 * ENOENT means a hole in the image.  We zero-fill the
	 * entire length of the request.  A short read also implies
	 * zero-fill to the end of the request.  Either way we
	 * update the xferred count to indicate the whole request
	 * was satisfied.
	 */
	BUG_ON(obj_request->type != OBJ_REQUEST_BIO);
	if (obj_request->result == -ENOENT) {
		zero_bio_chain(obj_request->bio_list, 0);
		obj_request->result = 0;
		obj_request->xferred = obj_request->length;
	} else if (obj_request->xferred < obj_request->length &&
			!obj_request->result) {
		zero_bio_chain(obj_request->bio_list, obj_request->xferred);
		obj_request->xferred = obj_request->length;
	}
	obj_request_done_set(obj_request);
}
1292
Alex Elderbf0d5f502012-11-22 00:00:08 -06001293static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1294{
Alex Elder37206ee2013-02-20 17:32:08 -06001295 dout("%s: obj %p cb %p\n", __func__, obj_request,
1296 obj_request->callback);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001297 if (obj_request->callback)
1298 obj_request->callback(obj_request);
Alex Elder788e2df2013-01-17 12:25:27 -06001299 else
1300 complete_all(&obj_request->completion);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001301}
1302
/* Generic completion for osd ops that need no result processing. */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);
	obj_request_done_set(obj_request);
}
1308
Alex Elderc47f9372013-02-26 14:23:07 -06001309static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001310{
Alex Elder37206ee2013-02-20 17:32:08 -06001311 dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request,
Alex Elderc47f9372013-02-26 14:23:07 -06001312 obj_request->result, obj_request->xferred, obj_request->length);
Alex Elder6e2a4502013-03-27 09:16:30 -05001313 if (obj_request->img_request)
1314 rbd_img_obj_request_read_callback(obj_request);
1315 else
1316 obj_request_done_set(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001317}
1318
/* Completion for a write op. */
static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
		obj_request->result, obj_request->length);
	/*
	 * There is no such thing as a successful short write.
	 * Our xferred value is the number of bytes transferred
	 * back.  Set it to our originally-requested length.
	 */
	obj_request->xferred = obj_request->length;
	obj_request_done_set(obj_request);
}
1331
/*
 * For a simple stat call there's nothing to do.  We'll do more if
 * this is part of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);
	obj_request_done_set(obj_request);
}
1341
/*
 * Completion callback for all rbd osd requests.  Records the result
 * and transfer count from the osd reply, dispatches to the
 * opcode-specific handler, and completes the object request if the
 * handler marked it done.  Only single-op requests are handled.
 */
static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
				struct ceph_msg *msg)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;
	u16 opcode;

	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
	rbd_assert(osd_req == obj_request->osd_req);
	/*
	 * Exactly one of these holds: the request belongs to an image
	 * request (and has a valid "which"), or it is standalone
	 * (img_request NULL, which == BAD_WHICH).
	 */
	rbd_assert(!!obj_request->img_request ^
				(obj_request->which == BAD_WHICH));

	if (osd_req->r_result < 0)
		obj_request->result = osd_req->r_result;
	obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);

	WARN_ON(osd_req->r_num_ops != 1);	/* For now */

	/*
	 * We support a 64-bit length, but ultimately it has to be
	 * passed to blk_end_request(), which takes an unsigned int.
	 */
	obj_request->xferred = osd_req->r_reply_op_len[0];
	rbd_assert(obj_request->xferred < (u64) UINT_MAX);
	opcode = osd_req->r_request_ops[0].op;
	switch (opcode) {
	case CEPH_OSD_OP_READ:
		rbd_osd_read_callback(obj_request);
		break;
	case CEPH_OSD_OP_WRITE:
		rbd_osd_write_callback(obj_request);
		break;
	case CEPH_OSD_OP_STAT:
		rbd_osd_stat_callback(obj_request);
		break;
	case CEPH_OSD_OP_CALL:
	case CEPH_OSD_OP_NOTIFY_ACK:
	case CEPH_OSD_OP_WATCH:
		rbd_osd_trivial_callback(obj_request);
		break;
	default:
		rbd_warn(NULL, "%s: unsupported op %hu\n",
			obj_request->object_name, (unsigned short) opcode);
		break;
	}

	if (obj_request_done_test(obj_request))
		rbd_obj_request_complete(obj_request);
}
1390
/*
 * Build a single-op osd request for an object request.  For writes
 * the snapshot context (from the image request, if any) and a
 * modification time are supplied; for reads the snapshot id is used
 * instead and offset/length are not passed to the message builder.
 * The request's data (bio chain or page vector) is attached based
 * on the object request's type.  Returns NULL on allocation failure.
 */
static struct ceph_osd_request *rbd_osd_req_create(
					struct rbd_device *rbd_dev,
					bool write_request,
					struct rbd_obj_request *obj_request,
					struct ceph_osd_req_op *op)
{
	struct rbd_img_request *img_request = obj_request->img_request;
	struct ceph_snap_context *snapc = NULL;
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;
	struct timespec now;
	struct timespec *mtime;
	u64 snap_id = CEPH_NOSNAP;
	u64 offset = obj_request->offset;
	u64 length = obj_request->length;

	if (img_request) {
		rbd_assert(img_request->write_request == write_request);
		if (img_request->write_request)
			snapc = img_request->snapc;
		else
			snap_id = img_request->snap_id;
	}

	/* Allocate and initialize the request, for the single op */

	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
	if (!osd_req)
		return NULL;	/* ENOMEM */

	/* Attach the request data according to its type */

	rbd_assert(obj_request_type_valid(obj_request->type));
	switch (obj_request->type) {
	case OBJ_REQUEST_NODATA:
		break;		/* Nothing to do */
	case OBJ_REQUEST_BIO:
		rbd_assert(obj_request->bio_list != NULL);
		osd_req->r_bio = obj_request->bio_list;
		break;
	case OBJ_REQUEST_PAGES:
		osd_req->r_pages = obj_request->pages;
		osd_req->r_num_pages = obj_request->page_count;
		osd_req->r_page_alignment = offset & ~PAGE_MASK;
		break;
	}

	if (write_request) {
		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
		now = CURRENT_TIME;
		mtime = &now;
	} else {
		osd_req->r_flags = CEPH_OSD_FLAG_READ;
		mtime = NULL;	/* not needed for reads */
		offset = 0;	/* These are not used... */
		length = 0;	/* ...for osd read requests */
	}

	osd_req->r_callback = rbd_osd_req_callback;
	osd_req->r_priv = obj_request;

	osd_req->r_oid_len = strlen(obj_request->object_name);
	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);

	osd_req->r_file_layout = rbd_dev->layout;	/* struct */

	/* osd_req will get its own reference to snapc (if non-null) */

	ceph_osdc_build_request(osd_req, offset, length, 1, op,
				snapc, snap_id, mtime);

	return osd_req;
}
1464
/* Drop our reference to an osd request created by rbd_osd_req_create(). */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1469
1470/* object_name is assumed to be a non-null pointer and NUL-terminated */
1471
1472static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1473 u64 offset, u64 length,
1474 enum obj_request_type type)
1475{
1476 struct rbd_obj_request *obj_request;
1477 size_t size;
1478 char *name;
1479
1480 rbd_assert(obj_request_type_valid(type));
1481
1482 size = strlen(object_name) + 1;
1483 obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1484 if (!obj_request)
1485 return NULL;
1486
1487 name = (char *)(obj_request + 1);
1488 obj_request->object_name = memcpy(name, object_name, size);
1489 obj_request->offset = offset;
1490 obj_request->length = length;
1491 obj_request->which = BAD_WHICH;
1492 obj_request->type = type;
1493 INIT_LIST_HEAD(&obj_request->links);
Alex Elder07741302013-02-05 23:41:50 -06001494 obj_request_done_init(obj_request);
Alex Elder788e2df2013-01-17 12:25:27 -06001495 init_completion(&obj_request->completion);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001496 kref_init(&obj_request->kref);
1497
Alex Elder37206ee2013-02-20 17:32:08 -06001498 dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
1499 offset, length, (int)type, obj_request);
1500
Alex Elderbf0d5f502012-11-22 00:00:08 -06001501 return obj_request;
1502}
1503
/*
 * kref release function for object requests.  The request must
 * already have been removed from any image request.  Releases the
 * osd request and the attached data (bio chain or page vector)
 * along with the request itself.
 */
static void rbd_obj_request_destroy(struct kref *kref)
{
	struct rbd_obj_request *obj_request;

	obj_request = container_of(kref, struct rbd_obj_request, kref);

	dout("%s: obj %p\n", __func__, obj_request);

	/* Must have been detached from its image request (if any) */
	rbd_assert(obj_request->img_request == NULL);
	rbd_assert(obj_request->which == BAD_WHICH);

	if (obj_request->osd_req)
		rbd_osd_req_destroy(obj_request->osd_req);

	rbd_assert(obj_request_type_valid(obj_request->type));
	switch (obj_request->type) {
	case OBJ_REQUEST_NODATA:
		break;		/* Nothing to do */
	case OBJ_REQUEST_BIO:
		if (obj_request->bio_list)
			bio_chain_put(obj_request->bio_list);
		break;
	case OBJ_REQUEST_PAGES:
		if (obj_request->pages)
			ceph_release_page_vector(obj_request->pages,
						obj_request->page_count);
		break;
	}

	kfree(obj_request);	/* frees the name copy too */
}
1535
/*
 * Caller is responsible for filling in the list of object requests
 * that comprises the image request, and the Linux request pointer
 * (if there is one).
 */
/*
 * Allocate and initialize an image request for (offset, length) of
 * the mapped image.  For writes, a reference to the current snapshot
 * context is taken (under header_rwsem) so in-flight writes keep a
 * consistent view; for reads, the mapped snapshot id is recorded
 * instead.  Returns NULL on failure.
 */
static struct rbd_img_request *rbd_img_request_create(
					struct rbd_device *rbd_dev,
					u64 offset, u64 length,
					bool write_request)
{
	struct rbd_img_request *img_request;
	struct ceph_snap_context *snapc = NULL;

	/* GFP_ATOMIC: may be called from the request function's context */
	img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
	if (!img_request)
		return NULL;

	if (write_request) {
		down_read(&rbd_dev->header_rwsem);
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
		up_read(&rbd_dev->header_rwsem);
		if (WARN_ON(!snapc)) {
			kfree(img_request);
			return NULL;	/* Shouldn't happen */
		}
	}

	img_request->rq = NULL;
	img_request->rbd_dev = rbd_dev;
	img_request->offset = offset;
	img_request->length = length;
	img_request->write_request = write_request;
	if (write_request)
		img_request->snapc = snapc;
	else
		img_request->snap_id = rbd_dev->spec->snap_id;
	spin_lock_init(&img_request->completion_lock);
	img_request->next_completion = 0;
	img_request->callback = NULL;
	img_request->obj_request_count = 0;
	INIT_LIST_HEAD(&img_request->obj_requests);
	kref_init(&img_request->kref);

	rbd_img_request_get(img_request);	/* Avoid a warning */
	rbd_img_request_put(img_request);	/* TEMPORARY */

	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
		write_request ? "write" : "read", offset, length,
		img_request);

	return img_request;
}
1588
/*
 * Release an image request once its last reference is dropped (kref
 * release callback).  Detaches and drops every remaining object
 * request, then releases the snapshot context taken for writes.
 */
static void rbd_img_request_destroy(struct kref *kref)
{
	struct rbd_img_request *img_request;
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	img_request = container_of(kref, struct rbd_img_request, kref);

	dout("%s: img %p\n", __func__, img_request);

	/* _safe variant: each del removes the entry we're standing on */
	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_img_obj_request_del(img_request, obj_request);
	rbd_assert(img_request->obj_request_count == 0);

	if (img_request->write_request)
		ceph_put_snap_context(img_request->snapc);

	kfree(img_request);
}
1608
/*
 * Split the data in bio_list into per-object requests and attach
 * them to the image request.  Each rbd object covers a fixed-size
 * segment of the image, so the image extent is walked segment by
 * segment, cloning the corresponding byte range out of the bio chain
 * for each one.  Returns 0 on success or -ENOMEM, in which case any
 * object requests built so far are torn down.
 */
static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
					struct bio *bio_list)
{
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	struct rbd_obj_request *obj_request = NULL;
	struct rbd_obj_request *next_obj_request;
	unsigned int bio_offset;
	u64 image_offset;
	u64 resid;
	u16 opcode;

	dout("%s: img %p bio %p\n", __func__, img_request, bio_list);

	opcode = img_request->write_request ? CEPH_OSD_OP_WRITE
					: CEPH_OSD_OP_READ;
	bio_offset = 0;
	image_offset = img_request->offset;
	rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT);
	resid = img_request->length;	/* bytes left to map to objects */
	rbd_assert(resid > 0);
	while (resid) {
		const char *object_name;
		unsigned int clone_size;
		struct ceph_osd_req_op *op;
		u64 offset;
		u64 length;

		object_name = rbd_segment_name(rbd_dev, image_offset);
		if (!object_name)
			goto out_unwind;
		/* offset/length of this piece within its object */
		offset = rbd_segment_offset(rbd_dev, image_offset);
		length = rbd_segment_length(rbd_dev, image_offset, resid);
		obj_request = rbd_obj_request_create(object_name,
							offset, length,
							OBJ_REQUEST_BIO);
		kfree(object_name);	/* object request has its own copy */
		if (!obj_request)
			goto out_unwind;

		rbd_assert(length <= (u64) UINT_MAX);
		clone_size = (unsigned int) length;
		obj_request->bio_list = bio_chain_clone_range(&bio_list,
						&bio_offset, clone_size,
						GFP_ATOMIC);
		if (!obj_request->bio_list)
			goto out_partial;

		/*
		 * Build up the op to use in building the osd
		 * request.  Note that the contents of the op are
		 * copied by rbd_osd_req_create().
		 */
		op = rbd_osd_req_op_create(opcode, offset, length);
		if (!op)
			goto out_partial;
		obj_request->osd_req = rbd_osd_req_create(rbd_dev,
						img_request->write_request,
						obj_request, op);
		rbd_osd_req_op_destroy(op);
		if (!obj_request->osd_req)
			goto out_partial;
		/* status and version are initially zero-filled */

		rbd_img_obj_request_add(img_request, obj_request);

		image_offset += length;
		resid -= length;
	}

	return 0;

out_partial:
	/* current obj_request was never added to the image request */
	rbd_obj_request_put(obj_request);
out_unwind:
	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_obj_request_put(obj_request);

	return -ENOMEM;
}
1688
/*
 * Completion callback for an object request belonging to an image
 * request.  Object requests may complete out of order, but the block
 * layer must be notified in order; only the request whose index
 * matches next_completion advances the window, sweeping forward over
 * any later requests that have already finished.  When the last one
 * is accounted for, the whole image request is completed.
 */
static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	u32 which = obj_request->which;
	bool more = true;

	img_request = obj_request->img_request;

	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
	rbd_assert(img_request != NULL);
	rbd_assert(img_request->rq != NULL);
	rbd_assert(img_request->obj_request_count > 0);
	rbd_assert(which != BAD_WHICH);
	rbd_assert(which < img_request->obj_request_count);
	rbd_assert(which >= img_request->next_completion);

	spin_lock_irq(&img_request->completion_lock);
	/* Out-of-order completion: leave it for a later sweep */
	if (which != img_request->next_completion)
		goto out;

	for_each_obj_request_from(img_request, obj_request) {
		unsigned int xferred;
		int result;

		rbd_assert(more);
		rbd_assert(which < img_request->obj_request_count);

		/* Stop at the first request that hasn't finished yet */
		if (!obj_request_done_test(obj_request))
			break;

		rbd_assert(obj_request->xferred <= (u64) UINT_MAX);
		xferred = (unsigned int) obj_request->xferred;
		result = (int) obj_request->result;
		if (result)
			rbd_warn(NULL, "obj_request %s result %d xferred %u\n",
				img_request->write_request ? "write" : "read",
				result, xferred);

		/* Returns false once the whole block request is done */
		more = blk_end_request(img_request->rq, result, xferred);
		which++;
	}

	rbd_assert(more ^ (which == img_request->obj_request_count));
	img_request->next_completion = which;
out:
	spin_unlock_irq(&img_request->completion_lock);

	if (!more)
		rbd_img_request_complete(img_request);
}
1739
/*
 * Submit all object requests belonging to an image request to the
 * osd client.  Returns 0 on success; on a submit failure the error
 * is returned immediately and remaining requests are not submitted.
 * NOTE(review): the _safe iterator is needed because the put below
 * may be the last reference, removing the entry mid-walk.
 */
static int rbd_img_request_submit(struct rbd_img_request *img_request)
{
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	dout("%s: img %p\n", __func__, img_request);
	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
		int ret;

		obj_request->callback = rbd_img_obj_callback;
		ret = rbd_obj_request_submit(osdc, obj_request);
		if (ret)
			return ret;
		/*
		 * The image request has its own reference to each
		 * of its object requests, so we can safely drop the
		 * initial one here.
		 */
		rbd_obj_request_put(obj_request);
	}

	return 0;
}
1765
/*
 * Send an asynchronous NOTIFY_ACK for a watch notification on the
 * header object.  The object request's callback drops its reference
 * when the ack completes, so nothing waits on it here.  Returns 0 if
 * the ack was submitted, or a negative errno.
 */
static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
				u64 ver, u64 notify_id)
{
	struct rbd_obj_request *obj_request;
	struct ceph_osd_req_op *op;
	struct ceph_osd_client *osdc;
	int ret;

	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
							OBJ_REQUEST_NODATA);
	if (!obj_request)
		return -ENOMEM;

	ret = -ENOMEM;
	op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
	if (!op)
		goto out;
	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
						obj_request, op);
	rbd_osd_req_op_destroy(op);
	if (!obj_request->osd_req)
		goto out;

	osdc = &rbd_dev->rbd_client->client->osdc;
	/* Fire and forget: completion just drops the last reference */
	obj_request->callback = rbd_obj_request_put;
	ret = rbd_obj_request_submit(osdc, obj_request);
out:
	if (ret)
		rbd_obj_request_put(obj_request);

	return ret;
}
1798
1799static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1800{
1801 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1802 u64 hver;
1803 int rc;
1804
1805 if (!rbd_dev)
1806 return;
1807
Alex Elder37206ee2013-02-20 17:32:08 -06001808 dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
Alex Elderb8d70032012-11-30 17:53:04 -06001809 rbd_dev->header_name, (unsigned long long) notify_id,
1810 (unsigned int) opcode);
1811 rc = rbd_dev_refresh(rbd_dev, &hver);
1812 if (rc)
1813 rbd_warn(rbd_dev, "got notification but failed to "
1814 " update snaps: %d\n", rc);
1815
Alex Eldercf81b602013-01-17 12:18:46 -06001816 rbd_obj_notify_ack(rbd_dev, hver, notify_id);
Alex Elderb8d70032012-11-30 17:53:04 -06001817}
1818
/*
 * Request sync osd watch/unwatch.  The value of "start" determines
 * whether a watch request is being initiated or torn down.
 *
 * Starting: create an osd event, register a lingering WATCH request
 * for the header object, and stash the object request in
 * rbd_dev->watch_request so it can be found at teardown.
 * Stopping: unregister the lingering request and cancel the event.
 * The asserts enforce that start/stop calls strictly alternate.
 */
static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;
	struct ceph_osd_req_op *op;
	int ret;

	/* watch_event/watch_request exist iff a watch is active */
	rbd_assert(start ^ !!rbd_dev->watch_event);
	rbd_assert(start ^ !!rbd_dev->watch_request);

	if (start) {
		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
						&rbd_dev->watch_event);
		if (ret < 0)
			return ret;
		rbd_assert(rbd_dev->watch_event != NULL);
	}

	ret = -ENOMEM;
	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
							OBJ_REQUEST_NODATA);
	if (!obj_request)
		goto out_cancel;

	op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
				rbd_dev->watch_event->cookie,
				rbd_dev->header.obj_version, start);
	if (!op)
		goto out_cancel;
	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true,
							obj_request, op);
	rbd_osd_req_op_destroy(op);
	if (!obj_request->osd_req)
		goto out_cancel;

	if (start)
		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
	else
		ceph_osdc_unregister_linger_request(osdc,
					rbd_dev->watch_request->osd_req);
	ret = rbd_obj_request_submit(osdc, obj_request);
	if (ret)
		goto out_cancel;
	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out_cancel;
	ret = obj_request->result;
	if (ret)
		goto out_cancel;

	/*
	 * A watch request is set to linger, so the underlying osd
	 * request won't go away until we unregister it.  We retain
	 * a pointer to the object request during that time (in
	 * rbd_dev->watch_request), so we'll keep a reference to
	 * it.  We'll drop that reference (below) after we've
	 * unregistered it.
	 */
	if (start) {
		rbd_dev->watch_request = obj_request;

		return 0;
	}

	/* We have successfully torn down the watch request */

	rbd_obj_request_put(rbd_dev->watch_request);
	rbd_dev->watch_request = NULL;
out_cancel:
	/* Cancel the event if we're tearing down, or on error */
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	if (obj_request)
		rbd_obj_request_put(obj_request);

	return ret;
}
1900
/*
 * Synchronous osd object method call
 *
 * Invokes class_name.method_name on the named object and waits for
 * the result.  Outbound data (if any) supplies the method's
 * parameters; the reply is copied into the caller's inbound buffer.
 * Returns the method's result code (negative errno on failure), and
 * fills *version with the object version if requested.
 */
static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     u64 *version)
{
	struct rbd_obj_request *obj_request;
	struct ceph_osd_client *osdc;
	struct ceph_osd_req_op *op;
	struct page **pages;
	u32 page_count;
	int ret;

	/*
	 * Method calls are ultimately read operations but they
	 * don't involve object data (so no offset or length).
	 * The result should be placed into the inbound buffer
	 * provided.  They also supply outbound data--parameters for
	 * the object method.  Currently if this is present it will
	 * be a snapshot id.
	 */
	page_count = (u32) calc_pages_for(0, inbound_size);
	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	ret = -ENOMEM;
	obj_request = rbd_obj_request_create(object_name, 0, 0,
							OBJ_REQUEST_PAGES);
	if (!obj_request)
		goto out;

	/* obj_request now owns the pages; freed with it on the put below */
	obj_request->pages = pages;
	obj_request->page_count = page_count;

	op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
					method_name, outbound, outbound_size);
	if (!op)
		goto out;
	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
						obj_request, op);
	rbd_osd_req_op_destroy(op);
	if (!obj_request->osd_req)
		goto out;

	osdc = &rbd_dev->rbd_client->client->osdc;
	ret = rbd_obj_request_submit(osdc, obj_request);
	if (ret)
		goto out;
	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out;

	ret = obj_request->result;
	if (ret < 0)
		goto out;
	ret = 0;
	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
	if (version)
		*version = obj_request->version;
out:
	if (obj_request)
		rbd_obj_request_put(obj_request);
	else
		ceph_release_page_vector(pages, page_count);

	return ret;
}
1976
/*
 * Block-layer request function.  Pulls requests off the queue,
 * converts each into an image request, and submits it.  The queue
 * lock is held on entry (per blk_init_queue); it is dropped while
 * building/submitting each request and reacquired before fetching
 * the next one — hence the __releases/__acquires annotations.
 */
static void rbd_request_fn(struct request_queue *q)
		__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct rbd_device *rbd_dev = q->queuedata;
	bool read_only = rbd_dev->mapping.read_only;
	struct request *rq;
	int result;

	while ((rq = blk_fetch_request(q))) {
		bool write_request = rq_data_dir(rq) == WRITE;
		struct rbd_img_request *img_request;
		u64 offset;
		u64 length;

		/* Ignore any non-FS requests that filter through. */

		if (rq->cmd_type != REQ_TYPE_FS) {
			dout("%s: non-fs request type %d\n", __func__,
				(int) rq->cmd_type);
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* Ignore/skip any zero-length requests */

		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
		length = (u64) blk_rq_bytes(rq);

		if (!length) {
			dout("%s: zero-length request\n", __func__);
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* Drop the lock while we build and submit the request */
		spin_unlock_irq(q->queue_lock);

		/* Disallow writes to a read-only device */

		if (write_request) {
			result = -EROFS;
			if (read_only)
				goto end_request;
			/* Writes are only valid against the live image */
			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
		}

		/*
		 * Quit early if the mapped snapshot no longer
		 * exists.  It's still possible the snapshot will
		 * have disappeared by the time our request arrives
		 * at the osd, but there's no sense in sending it if
		 * we already know.
		 */
		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
			dout("request for non-existent snapshot");
			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
			result = -ENXIO;
			goto end_request;
		}

		result = -EINVAL;
		if (WARN_ON(offset && length > U64_MAX - offset + 1))
			goto end_request;	/* Shouldn't happen */

		result = -ENOMEM;
		img_request = rbd_img_request_create(rbd_dev, offset, length,
							write_request);
		if (!img_request)
			goto end_request;

		img_request->rq = rq;

		result = rbd_img_request_fill_bio(img_request, rq->bio);
		if (!result)
			result = rbd_img_request_submit(img_request);
		if (result)
			rbd_img_request_put(img_request);
end_request:
		spin_lock_irq(q->queue_lock);
		if (result < 0) {
			rbd_warn(rbd_dev, "obj_request %s result %d\n",
				write_request ? "write" : "read", result);
			__blk_end_request_all(rq, result);
		}
	}
}
2062
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone_range()
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	sector_t sector_offset;
	sector_t sectors_per_obj;
	sector_t obj_sector_offset;
	int ret;

	/*
	 * Find how far into its rbd object the partition-relative
	 * bio start sector is to offset relative to the enclosing
	 * device.
	 */
	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	/* obj_order >= SECTOR_SHIFT, so sectors_per_obj is a power of two */
	obj_sector_offset = sector_offset & (sectors_per_obj - 1);

	/*
	 * Compute the number of bytes from that offset to the end
	 * of the object.  Account for what's already used by the bio.
	 */
	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
	if (ret > bmd->bi_size)
		ret -= bmd->bi_size;
	else
		ret = 0;

	/*
	 * Don't send back more than was asked for.  And if the bio
	 * was empty, let the whole thing through because:  "Note
	 * that a block device *must* allow a single page to be
	 * added to an empty bio."
	 */
	rbd_assert(bvec->bv_len <= PAGE_SIZE);
	if (ret > (int) bvec->bv_len || !bmd->bi_size)
		ret = (int) bvec->bv_len;

	return ret;
}
2108
/*
 * Tear down the gendisk (and its request queue) for an rbd device.
 * Safe to call if the disk was never allocated or never added.
 */
static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk = rbd_dev->disk;

	if (!disk)
		return;

	/* Only unregister if add_disk() actually ran */
	if (disk->flags & GENHD_FL_UP)
		del_gendisk(disk);
	if (disk->queue)
		blk_cleanup_queue(disk->queue);
	put_disk(disk);
}
2122
Alex Elder788e2df2013-01-17 12:25:27 -06002123static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2124 const char *object_name,
2125 u64 offset, u64 length,
2126 char *buf, u64 *version)
2127
2128{
2129 struct ceph_osd_req_op *op;
2130 struct rbd_obj_request *obj_request;
2131 struct ceph_osd_client *osdc;
2132 struct page **pages = NULL;
2133 u32 page_count;
Alex Elder1ceae7e2013-02-06 13:11:38 -06002134 size_t size;
Alex Elder788e2df2013-01-17 12:25:27 -06002135 int ret;
2136
2137 page_count = (u32) calc_pages_for(offset, length);
2138 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2139 if (IS_ERR(pages))
2140 ret = PTR_ERR(pages);
2141
2142 ret = -ENOMEM;
2143 obj_request = rbd_obj_request_create(object_name, offset, length,
Alex Elder36be9a72013-01-19 00:30:28 -06002144 OBJ_REQUEST_PAGES);
Alex Elder788e2df2013-01-17 12:25:27 -06002145 if (!obj_request)
2146 goto out;
2147
2148 obj_request->pages = pages;
2149 obj_request->page_count = page_count;
2150
2151 op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length);
2152 if (!op)
2153 goto out;
2154 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2155 obj_request, op);
2156 rbd_osd_req_op_destroy(op);
2157 if (!obj_request->osd_req)
2158 goto out;
2159
2160 osdc = &rbd_dev->rbd_client->client->osdc;
2161 ret = rbd_obj_request_submit(osdc, obj_request);
2162 if (ret)
2163 goto out;
2164 ret = rbd_obj_request_wait(obj_request);
2165 if (ret)
2166 goto out;
2167
2168 ret = obj_request->result;
2169 if (ret < 0)
2170 goto out;
Alex Elder1ceae7e2013-02-06 13:11:38 -06002171
2172 rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
2173 size = (size_t) obj_request->xferred;
Alex Elder903bb322013-02-06 13:11:38 -06002174 ceph_copy_from_page_vector(pages, buf, 0, size);
Alex Elder23ed6e12013-02-06 13:11:38 -06002175 rbd_assert(size <= (size_t) INT_MAX);
2176 ret = (int) size;
Alex Elder788e2df2013-01-17 12:25:27 -06002177 if (version)
2178 *version = obj_request->version;
2179out:
2180 if (obj_request)
2181 rbd_obj_request_put(obj_request);
2182 else
2183 ceph_release_page_vector(pages, page_count);
2184
2185 return ret;
2186}
2187
/*
 * Read the complete header for the given rbd device.
 *
 * Returns a pointer to a dynamically-allocated buffer containing
 * the complete and validated header.  Caller can pass the address
 * of a variable that will be filled in with the version of the
 * header object at the time it was read.
 *
 * Returns a pointer-coded errno if a failure occurs.
 */
static struct rbd_image_header_ondisk *
rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		/* No-op on the first pass (ondisk is NULL) */
		kfree(ondisk);

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return ERR_PTR(-ENOMEM);

		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
				       0, size,
				       (char *) ondisk, version);
		if (ret < 0)
			goto out_err;
		if (WARN_ON((size_t) ret < size)) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
				size, ret);
			goto out_err;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "invalid header");
			goto out_err;
		}

		/* Re-read if the snapshot count changed under us */
		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	return ondisk;

out_err:
	kfree(ondisk);

	return ERR_PTR(ret);
}
2255
/*
 * Re-read the on-disk (format 1) header and parse it into *header.
 * On success, header->obj_version is set to the version of the
 * header object that was read.  Returns 0 or a negative errno.
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
			   struct rbd_image_header *header)
{
	struct rbd_image_header_ondisk *ondisk;
	u64 ver = 0;
	int ret;

	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
	if (IS_ERR(ondisk))
		return PTR_ERR(ondisk);
	ret = rbd_header_from_disk(header, ondisk);
	if (ret >= 0)
		header->obj_version = ver;
	kfree(ondisk);

	return ret;
}
2276
/*
 * Remove every snapshot device attached to this rbd device.  Uses
 * the _safe iterator since each removal unlinks the current entry.
 */
static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
{
	struct rbd_snap *snap;
	struct rbd_snap *next;

	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
		rbd_remove_snap_dev(snap);
}
2285
/*
 * Propagate a change in the image size to the mapped block device's
 * capacity.  Only applies when the base image (CEPH_NOSNAP) is
 * mapped; snapshot mappings have a fixed size.
 */
static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
{
	sector_t size;

	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
		return;

	size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
	dout("setting size to %llu sectors", (unsigned long long) size);
	rbd_dev->mapping.size = (u64) size;
	set_capacity(rbd_dev->disk, size);
}
2298
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002299/*
2300 * only read the first part of the ondisk header, without the snaps info
2301 */
Alex Elder117973f2012-08-31 17:29:55 -05002302static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002303{
2304 int ret;
2305 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002306
2307 ret = rbd_read_header(rbd_dev, &h);
2308 if (ret < 0)
2309 return ret;
2310
Josh Durgina51aa0c2011-12-05 10:35:04 -08002311 down_write(&rbd_dev->header_rwsem);
2312
Alex Elder94785542012-10-09 13:50:17 -07002313 /* Update image size, and check for resize of mapped image */
2314 rbd_dev->header.image_size = h.image_size;
2315 rbd_update_mapping_size(rbd_dev);
Sage Weil9db4b3e2011-04-19 22:49:06 -07002316
Alex Elder849b4262012-07-09 21:04:24 -05002317 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002318 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05002319 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08002320 /* osd requests may still refer to snapc */
2321 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002322
Alex Elderb8136232012-07-25 09:32:41 -05002323 if (hver)
2324 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08002325 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08002326 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002327 rbd_dev->header.snapc = h.snapc;
2328 rbd_dev->header.snap_names = h.snap_names;
2329 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05002330 /* Free the extra copy of the object prefix */
2331 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
2332 kfree(h.object_prefix);
2333
Alex Elder304f6802012-08-31 17:29:52 -05002334 ret = rbd_dev_snaps_update(rbd_dev);
2335 if (!ret)
2336 ret = rbd_dev_snaps_register(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002337
Josh Durginc6666012011-11-21 17:11:12 -08002338 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002339
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002340 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002341}
2342
/*
 * Refresh the device's cached header, dispatching on image format
 * (1 or 2).  Serialized under ctl_mutex.  If non-null, *hver is
 * filled with the new header version.
 */
static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	if (rbd_dev->image_format == 1)
		ret = rbd_dev_v1_refresh(rbd_dev, hver);
	else
		ret = rbd_dev_v2_refresh(rbd_dev, hver);
	mutex_unlock(&ctl_mutex);

	return ret;
}
2357
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002358static int rbd_init_disk(struct rbd_device *rbd_dev)
2359{
2360 struct gendisk *disk;
2361 struct request_queue *q;
Alex Elder593a9e72012-02-07 12:03:37 -06002362 u64 segment_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002363
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002364 /* create gendisk info */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002365 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
2366 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05002367 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002368
Alex Elderf0f8cef2012-01-29 13:57:44 -06002369 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05002370 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002371 disk->major = rbd_dev->major;
2372 disk->first_minor = 0;
2373 disk->fops = &rbd_bd_ops;
2374 disk->private_data = rbd_dev;
2375
Alex Elderbf0d5f502012-11-22 00:00:08 -06002376 q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002377 if (!q)
2378 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07002379
Alex Elder593a9e72012-02-07 12:03:37 -06002380 /* We use the default size, but let's be explicit about it. */
2381 blk_queue_physical_block_size(q, SECTOR_SIZE);
2382
Josh Durgin029bcbd2011-07-22 11:35:23 -07002383 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06002384 segment_size = rbd_obj_bytes(&rbd_dev->header);
2385 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
2386 blk_queue_max_segment_size(q, segment_size);
2387 blk_queue_io_min(q, segment_size);
2388 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07002389
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002390 blk_queue_merge_bvec(q, rbd_merge_bvec);
2391 disk->queue = q;
2392
2393 q->queuedata = rbd_dev;
2394
2395 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002396
Alex Elder12f02942012-08-29 17:11:07 -05002397 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
2398
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002399 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002400out_disk:
2401 put_disk(disk);
Alex Elder1fcdb8a2012-08-29 17:11:06 -05002402
2403 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002404}
2405
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002406/*
2407 sysfs
2408*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002409
/* Map a sysfs struct device back to the rbd_device that embeds it. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
2414
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002415static ssize_t rbd_size_show(struct device *dev,
2416 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002417{
Alex Elder593a9e72012-02-07 12:03:37 -06002418 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08002419 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002420
Josh Durgina51aa0c2011-12-05 10:35:04 -08002421 down_read(&rbd_dev->header_rwsem);
2422 size = get_capacity(rbd_dev->disk);
2423 up_read(&rbd_dev->header_rwsem);
2424
2425 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002426}
2427
Alex Elder34b13182012-07-13 20:35:12 -05002428/*
2429 * Note this shows the features for whatever's mapped, which is not
2430 * necessarily the base image.
2431 */
2432static ssize_t rbd_features_show(struct device *dev,
2433 struct device_attribute *attr, char *buf)
2434{
2435 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2436
2437 return sprintf(buf, "0x%016llx\n",
2438 (unsigned long long) rbd_dev->mapping.features);
2439}
2440
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002441static ssize_t rbd_major_show(struct device *dev,
2442 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002443{
Alex Elder593a9e72012-02-07 12:03:37 -06002444 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002445
2446 return sprintf(buf, "%d\n", rbd_dev->major);
2447}
2448
2449static ssize_t rbd_client_id_show(struct device *dev,
2450 struct device_attribute *attr, char *buf)
2451{
Alex Elder593a9e72012-02-07 12:03:37 -06002452 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002453
Alex Elder1dbb4392012-01-24 10:08:37 -06002454 return sprintf(buf, "client%lld\n",
2455 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002456}
2457
2458static ssize_t rbd_pool_show(struct device *dev,
2459 struct device_attribute *attr, char *buf)
2460{
Alex Elder593a9e72012-02-07 12:03:37 -06002461 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002462
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002463 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002464}
2465
Alex Elder9bb2f332012-07-12 10:46:35 -05002466static ssize_t rbd_pool_id_show(struct device *dev,
2467 struct device_attribute *attr, char *buf)
2468{
2469 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2470
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002471 return sprintf(buf, "%llu\n",
2472 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05002473}
2474
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002475static ssize_t rbd_name_show(struct device *dev,
2476 struct device_attribute *attr, char *buf)
2477{
Alex Elder593a9e72012-02-07 12:03:37 -06002478 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002479
Alex Eldera92ffdf2012-10-30 19:40:33 -05002480 if (rbd_dev->spec->image_name)
2481 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2482
2483 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002484}
2485
Alex Elder589d30e2012-07-10 20:30:11 -05002486static ssize_t rbd_image_id_show(struct device *dev,
2487 struct device_attribute *attr, char *buf)
2488{
2489 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2490
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002491 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05002492}
2493
Alex Elder34b13182012-07-13 20:35:12 -05002494/*
2495 * Shows the name of the currently-mapped snapshot (or
2496 * RBD_SNAP_HEAD_NAME for the base image).
2497 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002498static ssize_t rbd_snap_show(struct device *dev,
2499 struct device_attribute *attr,
2500 char *buf)
2501{
Alex Elder593a9e72012-02-07 12:03:37 -06002502 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002503
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002504 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002505}
2506
Alex Elder86b00e02012-10-25 23:34:42 -05002507/*
2508 * For an rbd v2 image, shows the pool id, image id, and snapshot id
2509 * for the parent image. If there is no parent, simply shows
2510 * "(no parent image)".
2511 */
2512static ssize_t rbd_parent_show(struct device *dev,
2513 struct device_attribute *attr,
2514 char *buf)
2515{
2516 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2517 struct rbd_spec *spec = rbd_dev->parent_spec;
2518 int count;
2519 char *bufp = buf;
2520
2521 if (!spec)
2522 return sprintf(buf, "(no parent image)\n");
2523
2524 count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
2525 (unsigned long long) spec->pool_id, spec->pool_name);
2526 if (count < 0)
2527 return count;
2528 bufp += count;
2529
2530 count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
2531 spec->image_name ? spec->image_name : "(unknown)");
2532 if (count < 0)
2533 return count;
2534 bufp += count;
2535
2536 count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
2537 (unsigned long long) spec->snap_id, spec->snap_name);
2538 if (count < 0)
2539 return count;
2540 bufp += count;
2541
2542 count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
2543 if (count < 0)
2544 return count;
2545 bufp += count;
2546
2547 return (ssize_t) (bufp - buf);
2548}
2549
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002550static ssize_t rbd_image_refresh(struct device *dev,
2551 struct device_attribute *attr,
2552 const char *buf,
2553 size_t size)
2554{
Alex Elder593a9e72012-02-07 12:03:37 -06002555 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05002556 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002557
Alex Elder117973f2012-08-31 17:29:55 -05002558 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05002559
2560 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002561}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002562
/*
 * Per-device sysfs attributes; all read-only except "refresh".
 * See Documentation/ABI/testing/sysfs-bus-rbd for their meanings.
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* No-op release: rbd_device lifetime is managed by the driver itself */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
2608
2609
2610/*
2611 sysfs - snapshots
2612*/
2613
2614static ssize_t rbd_snap_size_show(struct device *dev,
2615 struct device_attribute *attr,
2616 char *buf)
2617{
2618 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2619
Josh Durgin35915382011-12-05 18:25:13 -08002620 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002621}
2622
2623static ssize_t rbd_snap_id_show(struct device *dev,
2624 struct device_attribute *attr,
2625 char *buf)
2626{
2627 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2628
Josh Durgin35915382011-12-05 18:25:13 -08002629 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002630}
2631
Alex Elder34b13182012-07-13 20:35:12 -05002632static ssize_t rbd_snap_features_show(struct device *dev,
2633 struct device_attribute *attr,
2634 char *buf)
2635{
2636 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2637
2638 return sprintf(buf, "0x%016llx\n",
2639 (unsigned long long) snap->features);
2640}
2641
/* Per-snapshot sysfs attributes (all read-only). */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/*
 * Device release callback for a snapshot: frees the rbd_snap and the
 * name it owns once the last reference to its struct device is gone.
 */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2673
/* Take a reference on a spec; returns the spec for caller convenience. */
static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
{
	kref_get(&spec->kref);

	return spec;
}
2680
static void rbd_spec_free(struct kref *kref);
/* Drop a reference on a spec (NULL is allowed); frees it on last put. */
static void rbd_spec_put(struct rbd_spec *spec)
{
	if (spec)
		kref_put(&spec->kref, rbd_spec_free);
}
2687
2688static struct rbd_spec *rbd_spec_alloc(void)
2689{
2690 struct rbd_spec *spec;
2691
2692 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2693 if (!spec)
2694 return NULL;
2695 kref_init(&spec->kref);
2696
2697 rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */
2698
2699 return spec;
2700}
2701
2702static void rbd_spec_free(struct kref *kref)
2703{
2704 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2705
2706 kfree(spec->pool_name);
2707 kfree(spec->image_id);
2708 kfree(spec->image_name);
2709 kfree(spec->snap_name);
2710 kfree(spec);
2711}
2712
Alex Eldercc344fa2013-02-19 12:25:56 -06002713static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
Alex Elderc53d5892012-10-25 23:34:42 -05002714 struct rbd_spec *spec)
2715{
2716 struct rbd_device *rbd_dev;
2717
2718 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2719 if (!rbd_dev)
2720 return NULL;
2721
2722 spin_lock_init(&rbd_dev->lock);
Alex Elder6d292902013-01-14 12:43:31 -06002723 rbd_dev->flags = 0;
Alex Elderc53d5892012-10-25 23:34:42 -05002724 INIT_LIST_HEAD(&rbd_dev->node);
2725 INIT_LIST_HEAD(&rbd_dev->snaps);
2726 init_rwsem(&rbd_dev->header_rwsem);
2727
2728 rbd_dev->spec = spec;
2729 rbd_dev->rbd_client = rbdc;
2730
Alex Elder0903e872012-11-14 12:25:19 -06002731 /* Initialize the layout used for all rbd requests */
2732
2733 rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2734 rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
2735 rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2736 rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
2737
Alex Elderc53d5892012-10-25 23:34:42 -05002738 return rbd_dev;
2739}
2740
2741static void rbd_dev_destroy(struct rbd_device *rbd_dev)
2742{
Alex Elder86b00e02012-10-25 23:34:42 -05002743 rbd_spec_put(rbd_dev->parent_spec);
Alex Elderc53d5892012-10-25 23:34:42 -05002744 kfree(rbd_dev->header_name);
2745 rbd_put_client(rbd_dev->rbd_client);
2746 rbd_spec_put(rbd_dev->spec);
2747 kfree(rbd_dev);
2748}
2749
Alex Elder304f6802012-08-31 17:29:52 -05002750static bool rbd_snap_registered(struct rbd_snap *snap)
2751{
2752 bool ret = snap->dev.type == &rbd_snap_device_type;
2753 bool reg = device_is_registered(&snap->dev);
2754
2755 rbd_assert(!ret ^ reg);
2756
2757 return ret;
2758}
2759
/* Unlink a snapshot from its device list and drop its sysfs device. */
static void rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	/* An unregistered snap is freed directly by its release callback */
	if (device_is_registered(&snap->dev))
		device_unregister(&snap->dev);
}
2766
Alex Elder14e70852012-07-19 09:09:27 -05002767static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002768 struct device *parent)
2769{
2770 struct device *dev = &snap->dev;
2771 int ret;
2772
2773 dev->type = &rbd_snap_device_type;
2774 dev->parent = parent;
2775 dev->release = rbd_snap_dev_release;
Alex Elderd4b125e2012-07-03 16:01:19 -05002776 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
Alex Elder304f6802012-08-31 17:29:52 -05002777 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2778
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002779 ret = device_register(dev);
2780
2781 return ret;
2782}
2783
Alex Elder4e891e02012-07-10 20:30:10 -05002784static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05002785 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05002786 u64 snap_id, u64 snap_size,
2787 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002788{
Alex Elder4e891e02012-07-10 20:30:10 -05002789 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002790 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002791
2792 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002793 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002794 return ERR_PTR(-ENOMEM);
2795
2796 ret = -ENOMEM;
Alex Elderc8d18422012-07-10 20:30:11 -05002797 snap->name = kstrdup(snap_name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002798 if (!snap->name)
2799 goto err;
2800
Alex Elderc8d18422012-07-10 20:30:11 -05002801 snap->id = snap_id;
2802 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05002803 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05002804
2805 return snap;
2806
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002807err:
2808 kfree(snap->name);
2809 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002810
2811 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002812}
2813
Alex Eldercd892122012-07-03 16:01:19 -05002814static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2815 u64 *snap_size, u64 *snap_features)
2816{
2817 char *snap_name;
2818
2819 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2820
2821 *snap_size = rbd_dev->header.snap_sizes[which];
2822 *snap_features = 0; /* No features for v1 */
2823
2824 /* Skip over names until we find the one we are looking for */
2825
2826 snap_name = rbd_dev->header.snap_names;
2827 while (which--)
2828 snap_name += strlen(snap_name) + 1;
2829
2830 return snap_name;
2831}
2832
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	/* Wire format of the "get_size" reply: order byte + le64 size */
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf), NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2865
/* Fetch object order and size for the base image (snap CEPH_NOSNAP). */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
2872
/*
 * Fetch the object name prefix for a format 2 image via the
 * "get_object_prefix" class method and store it in the header.
 * Returns 0 on success, a negative errno on failure.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	/* Reply is a length-prefixed string; duplicate it for the header */
	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
2908
/*
 * Fetch the feature bits for the given snapshot (or for the base
 * image if snap_id is CEPH_NOSNAP).  Fails with -ENXIO if the image
 * uses any incompatible feature this driver does not support.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	/* Wire format of the "get_features" reply */
	struct {
		__le64 features;
		__le64 incompat;
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Refuse to map an image needing features we don't implement */
	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_ALL)
		return -ENXIO;

	*snap_features = le64_to_cpu(features_buf.features);

	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2942
/* Fetch feature bits for the base image (snap CEPH_NOSNAP). */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}
2948
Alex Elder86b00e02012-10-25 23:34:42 -05002949static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
2950{
2951 struct rbd_spec *parent_spec;
2952 size_t size;
2953 void *reply_buf = NULL;
2954 __le64 snapid;
2955 void *p;
2956 void *end;
2957 char *image_id;
2958 u64 overlap;
Alex Elder86b00e02012-10-25 23:34:42 -05002959 int ret;
2960
2961 parent_spec = rbd_spec_alloc();
2962 if (!parent_spec)
2963 return -ENOMEM;
2964
2965 size = sizeof (__le64) + /* pool_id */
2966 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
2967 sizeof (__le64) + /* snap_id */
2968 sizeof (__le64); /* overlap */
2969 reply_buf = kmalloc(size, GFP_KERNEL);
2970 if (!reply_buf) {
2971 ret = -ENOMEM;
2972 goto out_err;
2973 }
2974
2975 snapid = cpu_to_le64(CEPH_NOSNAP);
Alex Elder36be9a72013-01-19 00:30:28 -06002976 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder86b00e02012-10-25 23:34:42 -05002977 "rbd", "get_parent",
2978 (char *) &snapid, sizeof (snapid),
Alex Elder07b23912012-11-09 08:43:16 -06002979 (char *) reply_buf, size, NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06002980 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder86b00e02012-10-25 23:34:42 -05002981 if (ret < 0)
2982 goto out_err;
2983
2984 ret = -ERANGE;
2985 p = reply_buf;
2986 end = (char *) reply_buf + size;
2987 ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
2988 if (parent_spec->pool_id == CEPH_NOPOOL)
2989 goto out; /* No parent? No problem. */
2990
Alex Elder0903e872012-11-14 12:25:19 -06002991 /* The ceph file layout needs to fit pool id in 32 bits */
2992
2993 ret = -EIO;
2994 if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
2995 goto out;
2996
Alex Elder979ed482012-11-01 08:39:26 -05002997 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elder86b00e02012-10-25 23:34:42 -05002998 if (IS_ERR(image_id)) {
2999 ret = PTR_ERR(image_id);
3000 goto out_err;
3001 }
3002 parent_spec->image_id = image_id;
3003 ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
3004 ceph_decode_64_safe(&p, end, overlap, out_err);
3005
3006 rbd_dev->parent_overlap = overlap;
3007 rbd_dev->parent_spec = parent_spec;
3008 parent_spec = NULL; /* rbd_dev now owns this */
3009out:
3010 ret = 0;
3011out_err:
3012 kfree(reply_buf);
3013 rbd_spec_put(parent_spec);
3014
3015 return ret;
3016}
3017
/*
 * Look up a format 2 image's name in the rbd directory object, given
 * its image id.  Returns a newly-allocated name string (caller frees),
 * or NULL on any failure -- callers tolerate a missing name.
 */
static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	rbd_assert(!rbd_dev->spec->image_name);

	/* Build the request: a length-prefixed copy of the image id */
	len = strlen(rbd_dev->spec->image_id);
	image_id_size = sizeof (__le32) + len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = (char *) image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
				"rbd", "dir_get_name",
				image_id, image_id_size,
				(char *) reply_buf, size, NULL);
	if (ret < 0)
		goto out;
	p = reply_buf;
	end = (char *) reply_buf + size;
	/* ceph_extract_encoded_string() returns an allocated copy */
	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}
3066
3067/*
3068 * When a parent image gets probed, we only have the pool, image,
3069 * and snapshot ids but not the names of any of them. This call
3070 * is made later to fill in those names. It has to be done after
3071 * rbd_dev_snaps_update() has completed because some of the
3072 * information (in particular, snapshot name) is not available
3073 * until then.
3074 */
3075static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
3076{
3077 struct ceph_osd_client *osdc;
3078 const char *name;
3079 void *reply_buf = NULL;
3080 int ret;
3081
3082 if (rbd_dev->spec->pool_name)
3083 return 0; /* Already have the names */
3084
3085 /* Look up the pool name */
3086
3087 osdc = &rbd_dev->rbd_client->client->osdc;
3088 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05003089 if (!name) {
3090 rbd_warn(rbd_dev, "there is no pool with id %llu",
3091 rbd_dev->spec->pool_id); /* Really a BUG() */
3092 return -EIO;
3093 }
Alex Elder9e15b772012-10-30 19:40:33 -05003094
3095 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
3096 if (!rbd_dev->spec->pool_name)
3097 return -ENOMEM;
3098
3099 /* Fetch the image name; tolerate failure here */
3100
3101 name = rbd_dev_image_name(rbd_dev);
Alex Elder69e7a022012-11-01 08:39:26 -05003102 if (name)
Alex Elder9e15b772012-10-30 19:40:33 -05003103 rbd_dev->spec->image_name = (char *) name;
Alex Elder69e7a022012-11-01 08:39:26 -05003104 else
Alex Elder06ecc6c2012-11-01 10:17:15 -05003105 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05003106
3107 /* Look up the snapshot name. */
3108
3109 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
3110 if (!name) {
Alex Elder935dc892012-11-01 10:17:15 -05003111 rbd_warn(rbd_dev, "no snapshot with id %llu",
3112 rbd_dev->spec->snap_id); /* Really a BUG() */
Alex Elder9e15b772012-10-30 19:40:33 -05003113 ret = -EIO;
3114 goto out_err;
3115 }
3116 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
3117 if(!rbd_dev->spec->snap_name)
3118 goto out_err;
3119
3120 return 0;
3121out_err:
3122 kfree(reply_buf);
3123 kfree(rbd_dev->spec->pool_name);
3124 rbd_dev->spec->pool_name = NULL;
3125
3126 return ret;
3127}
3128
Alex Elder6e14b1a2012-07-03 16:01:19 -05003129static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05003130{
3131 size_t size;
3132 int ret;
3133 void *reply_buf;
3134 void *p;
3135 void *end;
3136 u64 seq;
3137 u32 snap_count;
3138 struct ceph_snap_context *snapc;
3139 u32 i;
3140
3141 /*
3142 * We'll need room for the seq value (maximum snapshot id),
3143 * snapshot count, and array of that many snapshot ids.
3144 * For now we have a fixed upper limit on the number we're
3145 * prepared to receive.
3146 */
3147 size = sizeof (__le64) + sizeof (__le32) +
3148 RBD_MAX_SNAP_COUNT * sizeof (__le64);
3149 reply_buf = kzalloc(size, GFP_KERNEL);
3150 if (!reply_buf)
3151 return -ENOMEM;
3152
Alex Elder36be9a72013-01-19 00:30:28 -06003153 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder35d489f2012-07-03 16:01:19 -05003154 "rbd", "get_snapcontext",
3155 NULL, 0,
Alex Elder07b23912012-11-09 08:43:16 -06003156 reply_buf, size, ver);
Alex Elder36be9a72013-01-19 00:30:28 -06003157 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder35d489f2012-07-03 16:01:19 -05003158 if (ret < 0)
3159 goto out;
3160
3161 ret = -ERANGE;
3162 p = reply_buf;
3163 end = (char *) reply_buf + size;
3164 ceph_decode_64_safe(&p, end, seq, out);
3165 ceph_decode_32_safe(&p, end, snap_count, out);
3166
3167 /*
3168 * Make sure the reported number of snapshot ids wouldn't go
3169 * beyond the end of our buffer. But before checking that,
3170 * make sure the computed size of the snapshot context we
3171 * allocate is representable in a size_t.
3172 */
3173 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
3174 / sizeof (u64)) {
3175 ret = -EINVAL;
3176 goto out;
3177 }
3178 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3179 goto out;
3180
3181 size = sizeof (struct ceph_snap_context) +
3182 snap_count * sizeof (snapc->snaps[0]);
3183 snapc = kmalloc(size, GFP_KERNEL);
3184 if (!snapc) {
3185 ret = -ENOMEM;
3186 goto out;
3187 }
3188
3189 atomic_set(&snapc->nref, 1);
3190 snapc->seq = seq;
3191 snapc->num_snaps = snap_count;
3192 for (i = 0; i < snap_count; i++)
3193 snapc->snaps[i] = ceph_decode_64(&p);
3194
3195 rbd_dev->header.snapc = snapc;
3196
3197 dout(" snap context seq = %llu, snap_count = %u\n",
3198 (unsigned long long) seq, (unsigned int) snap_count);
3199
3200out:
3201 kfree(reply_buf);
3202
3203 return 0;
3204}
3205
/*
 * Fetch the name of the snapshot at index @which in the snapshot
 * context of a format 2 image.  Returns a newly-allocated string
 * (caller frees) or an ERR_PTR on failure.
 */
static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	size_t size;
	void *reply_buf;
	__le64 snap_id;
	int ret;
	void *p;
	void *end;
	char *snap_name;

	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_snapshot_name",
				(char *) &snap_id, sizeof (snap_id),
				reply_buf, size, NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	end = (char *) reply_buf + size;
	/* Returns an allocated copy, so reply_buf can be freed below */
	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out;
	} else {
		dout("  snap_id 0x%016llx snap_name = %s\n",
			(unsigned long long) le64_to_cpu(snap_id), snap_name);
	}
	kfree(reply_buf);

	return snap_name;
out:
	kfree(reply_buf);

	return ERR_PTR(ret);
}
3248
3249static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3250 u64 *snap_size, u64 *snap_features)
3251{
Alex Eldere0b49862013-01-09 14:44:18 -06003252 u64 snap_id;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003253 u8 order;
3254 int ret;
3255
3256 snap_id = rbd_dev->header.snapc->snaps[which];
3257 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
3258 if (ret)
3259 return ERR_PTR(ret);
3260 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
3261 if (ret)
3262 return ERR_PTR(ret);
3263
3264 return rbd_dev_v2_snap_name(rbd_dev, which);
3265}
3266
3267static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3268 u64 *snap_size, u64 *snap_features)
3269{
3270 if (rbd_dev->image_format == 1)
3271 return rbd_dev_v1_snap_info(rbd_dev, which,
3272 snap_size, snap_features);
3273 if (rbd_dev->image_format == 2)
3274 return rbd_dev_v2_snap_info(rbd_dev, which,
3275 snap_size, snap_features);
3276 return ERR_PTR(-EINVAL);
3277}
3278
Alex Elder117973f2012-08-31 17:29:55 -05003279static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3280{
3281 int ret;
3282 __u8 obj_order;
3283
3284 down_write(&rbd_dev->header_rwsem);
3285
3286 /* Grab old order first, to see if it changes */
3287
3288 obj_order = rbd_dev->header.obj_order,
3289 ret = rbd_dev_v2_image_size(rbd_dev);
3290 if (ret)
3291 goto out;
3292 if (rbd_dev->header.obj_order != obj_order) {
3293 ret = -EIO;
3294 goto out;
3295 }
3296 rbd_update_mapping_size(rbd_dev);
3297
3298 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
3299 dout("rbd_dev_v2_snap_context returned %d\n", ret);
3300 if (ret)
3301 goto out;
3302 ret = rbd_dev_snaps_update(rbd_dev);
3303 dout("rbd_dev_snaps_update returned %d\n", ret);
3304 if (ret)
3305 goto out;
3306 ret = rbd_dev_snaps_register(rbd_dev);
3307 dout("rbd_dev_snaps_register returned %d\n", ret);
3308out:
3309 up_write(&rbd_dev->header_rwsem);
3310
3311 return ret;
3312}
3313
Alex Elder9d475de2012-07-03 16:01:19 -05003314/*
Alex Elder35938152012-08-02 11:29:46 -05003315 * Scan the rbd device's current snapshot list and compare it to the
3316 * newly-received snapshot context. Remove any existing snapshots
3317 * not present in the new snapshot context. Add a new snapshot for
 * any snapshots in the snapshot context not in the current list.
3319 * And verify there are no changes to snapshots we already know
3320 * about.
3321 *
3322 * Assumes the snapshots in the snapshot context are sorted by
3323 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
3324 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003325 */
Alex Elder304f6802012-08-31 17:29:52 -05003326static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003327{
Alex Elder35938152012-08-02 11:29:46 -05003328 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
3329 const u32 snap_count = snapc->num_snaps;
Alex Elder35938152012-08-02 11:29:46 -05003330 struct list_head *head = &rbd_dev->snaps;
3331 struct list_head *links = head->next;
3332 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003333
Alex Elder9fcbb802012-08-23 23:48:49 -05003334 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
Alex Elder35938152012-08-02 11:29:46 -05003335 while (index < snap_count || links != head) {
3336 u64 snap_id;
3337 struct rbd_snap *snap;
Alex Eldercd892122012-07-03 16:01:19 -05003338 char *snap_name;
3339 u64 snap_size = 0;
3340 u64 snap_features = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003341
Alex Elder35938152012-08-02 11:29:46 -05003342 snap_id = index < snap_count ? snapc->snaps[index]
3343 : CEPH_NOSNAP;
3344 snap = links != head ? list_entry(links, struct rbd_snap, node)
3345 : NULL;
Alex Elderaafb230e2012-09-06 16:00:54 -05003346 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003347
Alex Elder35938152012-08-02 11:29:46 -05003348 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
3349 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003350
Alex Elder6d292902013-01-14 12:43:31 -06003351 /*
3352 * A previously-existing snapshot is not in
3353 * the new snap context.
3354 *
3355 * If the now missing snapshot is the one the
3356 * image is mapped to, clear its exists flag
3357 * so we can avoid sending any more requests
3358 * to it.
3359 */
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003360 if (rbd_dev->spec->snap_id == snap->id)
Alex Elder6d292902013-01-14 12:43:31 -06003361 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
Alex Elder41f38c22012-10-25 23:34:40 -05003362 rbd_remove_snap_dev(snap);
Alex Elder9fcbb802012-08-23 23:48:49 -05003363 dout("%ssnap id %llu has been removed\n",
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003364 rbd_dev->spec->snap_id == snap->id ?
3365 "mapped " : "",
Alex Elder9fcbb802012-08-23 23:48:49 -05003366 (unsigned long long) snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003367
Alex Elder35938152012-08-02 11:29:46 -05003368 /* Done with this list entry; advance */
3369
3370 links = next;
3371 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003372 }
Alex Elder35938152012-08-02 11:29:46 -05003373
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003374 snap_name = rbd_dev_snap_info(rbd_dev, index,
3375 &snap_size, &snap_features);
Alex Eldercd892122012-07-03 16:01:19 -05003376 if (IS_ERR(snap_name))
3377 return PTR_ERR(snap_name);
3378
Alex Elder9fcbb802012-08-23 23:48:49 -05003379 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
3380 (unsigned long long) snap_id);
Alex Elder35938152012-08-02 11:29:46 -05003381 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
3382 struct rbd_snap *new_snap;
3383
3384 /* We haven't seen this snapshot before */
3385
Alex Elderc8d18422012-07-10 20:30:11 -05003386 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
Alex Eldercd892122012-07-03 16:01:19 -05003387 snap_id, snap_size, snap_features);
Alex Elder9fcbb802012-08-23 23:48:49 -05003388 if (IS_ERR(new_snap)) {
3389 int err = PTR_ERR(new_snap);
3390
3391 dout(" failed to add dev, error %d\n", err);
3392
3393 return err;
3394 }
Alex Elder35938152012-08-02 11:29:46 -05003395
3396 /* New goes before existing, or at end of list */
3397
Alex Elder9fcbb802012-08-23 23:48:49 -05003398 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05003399 if (snap)
3400 list_add_tail(&new_snap->node, &snap->node);
3401 else
Alex Elder523f3252012-08-30 00:16:37 -05003402 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05003403 } else {
3404 /* Already have this one */
3405
Alex Elder9fcbb802012-08-23 23:48:49 -05003406 dout(" already present\n");
3407
Alex Eldercd892122012-07-03 16:01:19 -05003408 rbd_assert(snap->size == snap_size);
Alex Elderaafb230e2012-09-06 16:00:54 -05003409 rbd_assert(!strcmp(snap->name, snap_name));
Alex Eldercd892122012-07-03 16:01:19 -05003410 rbd_assert(snap->features == snap_features);
Alex Elder35938152012-08-02 11:29:46 -05003411
3412 /* Done with this list entry; advance */
3413
3414 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003415 }
Alex Elder35938152012-08-02 11:29:46 -05003416
3417 /* Advance to the next entry in the snapshot context */
3418
3419 index++;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003420 }
Alex Elder9fcbb802012-08-23 23:48:49 -05003421 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003422
3423 return 0;
3424}
3425
Alex Elder304f6802012-08-31 17:29:52 -05003426/*
3427 * Scan the list of snapshots and register the devices for any that
3428 * have not already been registered.
3429 */
3430static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
3431{
3432 struct rbd_snap *snap;
3433 int ret = 0;
3434
Alex Elder37206ee2013-02-20 17:32:08 -06003435 dout("%s:\n", __func__);
Alex Elder86ff77b2012-08-31 17:29:53 -05003436 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
3437 return -EIO;
Alex Elder304f6802012-08-31 17:29:52 -05003438
3439 list_for_each_entry(snap, &rbd_dev->snaps, node) {
3440 if (!rbd_snap_registered(snap)) {
3441 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
3442 if (ret < 0)
3443 break;
3444 }
3445 }
3446 dout("%s: returning %d\n", __func__, ret);
3447
3448 return ret;
3449}
3450
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003451static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3452{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003453 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05003454 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003455
3456 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003457
Alex Eldercd789ab2012-08-30 00:16:38 -05003458 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003459 dev->bus = &rbd_bus_type;
3460 dev->type = &rbd_device_type;
3461 dev->parent = &rbd_root_dev;
3462 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05003463 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003464 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003465
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003466 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05003467
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003468 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003469}
3470
/* Undo rbd_bus_add_dev(): remove the rbd device from the bus/sysfs. */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
3475
/*
 * Latest rbd device id handed out (ids start at 1; see
 * rbd_dev_id_get()).  rbd_dev_id_put() may lower it again when the
 * device holding the maximum id is released.
 */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06003477
3478/*
Alex Elder499afd52012-02-02 08:13:29 -06003479 * Get a unique rbd identifier for the given new rbd_dev, and add
3480 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06003481 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* ids are handed out monotonically; the first one assigned is 1 */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	/* make the new device visible on the global device list */
	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06003492
Alex Elder1ddbe942012-01-29 13:57:44 -06003493/*
Alex Elder499afd52012-02-02 08:13:29 -06003494 * Remove an rbd_dev from the global list, and record that its
3495 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06003496 */
Alex Eldere2839302012-08-29 17:11:06 -05003497static void rbd_dev_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06003498{
Alex Elderd184f6b2012-01-29 13:57:44 -06003499 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05003500 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06003501 int max_id;
3502
Alex Elderaafb230e2012-09-06 16:00:54 -05003503 rbd_assert(rbd_id > 0);
Alex Elder499afd52012-02-02 08:13:29 -06003504
Alex Eldere2839302012-08-29 17:11:06 -05003505 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3506 (unsigned long long) rbd_dev->dev_id);
Alex Elder499afd52012-02-02 08:13:29 -06003507 spin_lock(&rbd_dev_list_lock);
3508 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06003509
3510 /*
3511 * If the id being "put" is not the current maximum, there
3512 * is nothing special we need to do.
3513 */
Alex Eldere2839302012-08-29 17:11:06 -05003514 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
Alex Elderd184f6b2012-01-29 13:57:44 -06003515 spin_unlock(&rbd_dev_list_lock);
3516 return;
3517 }
3518
3519 /*
3520 * We need to update the current maximum id. Search the
3521 * list to find out what it is. We're more likely to find
3522 * the maximum at the end, so search the list backward.
3523 */
3524 max_id = 0;
3525 list_for_each_prev(tmp, &rbd_dev_list) {
3526 struct rbd_device *rbd_dev;
3527
3528 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderb213e0b2012-10-10 21:19:13 -07003529 if (rbd_dev->dev_id > max_id)
3530 max_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06003531 }
Alex Elder499afd52012-02-02 08:13:29 -06003532 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06003533
Alex Elder1ddbe942012-01-29 13:57:44 -06003534 /*
Alex Eldere2839302012-08-29 17:11:06 -05003535 * The max id could have been updated by rbd_dev_id_get(), in
Alex Elderd184f6b2012-01-29 13:57:44 -06003536 * which case it now accurately reflects the new maximum.
3537 * Be careful not to overwrite the maximum value in that
3538 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06003539 */
Alex Eldere2839302012-08-29 17:11:06 -05003540 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3541 dout(" max dev id has been reset\n");
Alex Elderb7f23c32012-01-29 13:57:43 -06003542}
3543
Alex Eldera725f65e2012-02-02 08:13:30 -06003544/*
Alex Eldere28fff262012-02-02 08:13:30 -06003545 * Skips over white space at *buf, and updates *buf to point to the
3546 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06003547 * the token (string of non-white space characters) found. Note
3548 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06003549 */
/*
 * Advance *buf past any leading white space and return the length of
 * the token (maximal run of non-space characters) that follows.  A
 * return of 0 means the string held only white space.  *buf must be
 * NUL-terminated; it is left pointing at the start of the token.
 */
static inline size_t next_token(const char **buf)
{
	/* The set matched by isspace() in the "C"/"POSIX" locales */
	static const char spaces[] = " \f\n\r\t\v";
	const char *start = *buf + strspn(*buf, spaces);

	*buf = start;

	return strcspn(start, spaces);
}
3562
3563/*
3564 * Finds the next token in *buf, and if the provided token buffer is
3565 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06003566 * copied, is guaranteed to be terminated with '\0'. Note that *buf
3567 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06003568 *
3569 * Returns the length of the token found (not including the '\0').
3570 * Return value will be 0 if no token is found, and it will be >=
3571 * token_size if the token would not fit.
3572 *
Alex Elder593a9e72012-02-07 12:03:37 -06003573 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06003574 * found token. Note that this occurs even if the token buffer is
3575 * too small to hold it.
3576 */
/*
 * Find the next token in *buf and, if it fits (with its trailing
 * '\0') in token_size bytes, copy it NUL-terminated into the given
 * buffer.  *buf is advanced past the token either way.  Returns the
 * token length; a result >= token_size means it was too big to copy.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
3592
3593/*
Alex Elderea3352f2012-07-09 21:04:23 -05003594 * Finds the next token in *buf, dynamically allocates a buffer big
3595 * enough to hold a copy of it, and copies the token into the new
3596 * buffer. The copy is guaranteed to be terminated with '\0'. Note
3597 * that a duplicate buffer is created even for a zero-length token.
3598 *
3599 * Returns a pointer to the newly-allocated duplicate, or a null
3600 * pointer if memory for the duplicate was not available. If
3601 * the lenp argument is a non-null pointer, the length of the token
3602 * (not including the '\0') is returned in *lenp.
3603 *
3604 * If successful, the *buf pointer will be updated to point beyond
3605 * the end of the found token.
3606 *
3607 * Note: uses GFP_KERNEL for allocation.
3608 */
3609static inline char *dup_token(const char **buf, size_t *lenp)
3610{
3611 char *dup;
3612 size_t len;
3613
3614 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05003615 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05003616 if (!dup)
3617 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05003618 *(dup + len) = '\0';
3619 *buf += len;
3620
3621 if (lenp)
3622 *lenp = len;
3623
3624 return dup;
3625}
3626
3627/*
Alex Elder859c31d2012-10-25 23:34:42 -05003628 * Parse the options provided for an "rbd add" (i.e., rbd image
3629 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
3630 * and the data written is passed here via a NUL-terminated buffer.
3631 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05003632 *
Alex Elder859c31d2012-10-25 23:34:42 -05003633 * The information extracted from these options is recorded in
3634 * the other parameters which return dynamically-allocated
3635 * structures:
3636 * ceph_opts
3637 * The address of a pointer that will refer to a ceph options
3638 * structure. Caller must release the returned pointer using
3639 * ceph_destroy_options() when it is no longer needed.
3640 * rbd_opts
3641 * Address of an rbd options pointer. Fully initialized by
3642 * this function; caller must release with kfree().
3643 * spec
3644 * Address of an rbd image specification pointer. Fully
3645 * initialized by this function based on parsed options.
3646 * Caller must release with rbd_spec_put().
3647 *
3648 * The options passed take this form:
3649 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3650 * where:
3651 * <mon_addrs>
3652 * A comma-separated list of one or more monitor addresses.
3653 * A monitor address is an ip address, optionally followed
3654 * by a port number (separated by a colon).
3655 * I.e.: ip1[:port1][,ip2[:port2]...]
3656 * <options>
3657 * A comma-separated list of ceph and/or rbd options.
3658 * <pool_name>
3659 * The name of the rados pool containing the rbd image.
3660 * <image_name>
3661 * The name of the image in that pool to map.
 *  <snap_id>
 *	An optional snapshot name (despite the "id" tag, the token
 *	is a name).  If provided, the mapping will present data from
 *	the image at the time that snapshot was created.  The image
 *	head is used if no snapshot is supplied.  Snapshot mappings
 *	are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06003667 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	size_t mon_addrs_size;
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	/*
	 * Record the monitor address span in place; it is not
	 * NUL-terminated here.  ceph_parse_options() below gets an
	 * explicit end pointer (mon_addrs + mon_addrs_size - 1).
	 */
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	/* Default error for the !*token checks below */
	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	spec->image_name = dup_token(&buf, NULL);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	/*
	 * Copy len + 1 bytes (the byte after the token is a space or
	 * '\0', so this stays in bounds) and force NUL termination.
	 */
	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!spec->snap_name)
		goto out_mem;
	*(spec->snap_name + len) = '\0';

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	/* rbd-specific option tokens are routed to parse_rbd_opts_token() */
	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	/* Success: ownership of copts, rbd_opts and spec passes to caller */
	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	/* rbd_spec_put() also frees the name strings recorded in spec */
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}
3770
Alex Elder589d30e2012-07-10 20:30:11 -05003771/*
3772 * An rbd format 2 image has a unique identifier, distinct from the
3773 * name given to it by the user. Internally, that identifier is
3774 * what's used to specify the names of objects related to the image.
3775 *
3776 * A special "rbd id" object is used to map an rbd image name to its
3777 * id. If that object doesn't exist, then there is no v2 rbd image
3778 * with the supplied name.
3779 *
3780 * This function will record the given rbd_dev's image_id field if
3781 * it can be determined, and in that case will return 0. If any
3782 * errors occur a negative errno will be returned and the rbd_dev's
3783 * image_id field will be unchanged (and should be NULL).
3784 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.
	 */
	if (rbd_dev->spec->image_id)
		return 0;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	/* Name of the id object: RBD_ID_PREFIX followed by the image name */
	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	ret = rbd_obj_method_sync(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX, NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	/*
	 * NOTE(review): on success, ret keeps whatever non-negative
	 * value rbd_obj_method_sync() returned -- presumably 0;
	 * confirm callers treat any non-negative result as success.
	 */
	p = response;
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						NULL, GFP_NOIO);
	if (IS_ERR(rbd_dev->spec->image_id)) {
		/* Leave image_id NULL on failure, as documented above */
		ret = PTR_ERR(rbd_dev->spec->image_id);
		rbd_dev->spec->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
3845
Alex Eldera30b71b2012-07-10 20:30:11 -05003846static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3847{
3848 int ret;
3849 size_t size;
3850
3851 /* Version 1 images have no id; empty string is used */
3852
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003853 rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
3854 if (!rbd_dev->spec->image_id)
Alex Eldera30b71b2012-07-10 20:30:11 -05003855 return -ENOMEM;
Alex Eldera30b71b2012-07-10 20:30:11 -05003856
3857 /* Record the header object name for this rbd image. */
3858
Alex Elder69e7a022012-11-01 08:39:26 -05003859 size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
Alex Eldera30b71b2012-07-10 20:30:11 -05003860 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3861 if (!rbd_dev->header_name) {
3862 ret = -ENOMEM;
3863 goto out_err;
3864 }
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003865 sprintf(rbd_dev->header_name, "%s%s",
3866 rbd_dev->spec->image_name, RBD_SUFFIX);
Alex Eldera30b71b2012-07-10 20:30:11 -05003867
3868 /* Populate rbd image metadata */
3869
3870 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3871 if (ret < 0)
3872 goto out_err;
Alex Elder86b00e02012-10-25 23:34:42 -05003873
3874 /* Version 1 images have no parent (no layering) */
3875
3876 rbd_dev->parent_spec = NULL;
3877 rbd_dev->parent_overlap = 0;
3878
Alex Eldera30b71b2012-07-10 20:30:11 -05003879 rbd_dev->image_format = 1;
3880
3881 dout("discovered version 1 image, header name is %s\n",
3882 rbd_dev->header_name);
3883
3884 return 0;
3885
3886out_err:
3887 kfree(rbd_dev->header_name);
3888 rbd_dev->header_name = NULL;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003889 kfree(rbd_dev->spec->image_id);
3890 rbd_dev->spec->image_id = NULL;
Alex Eldera30b71b2012-07-10 20:30:11 -05003891
3892 return ret;
3893}
3894
/*
 * Probe the image assuming it uses the format 2 layout: build the
 * header object name from the (caller-supplied) image id, then fetch
 * the size/order, object prefix, features, optional parent info, and
 * snapshot context from the server.  Returns 0 on success; on error
 * everything recorded here is released before returning.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	/* Undo everything recorded above, in reverse order of setup */
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3967
/*
 * Second phase of device probe, run after the image header (v1 or
 * v2) has been read: refresh the snapshot list, fill in the mapping,
 * allocate a device id, register the block device and sysfs entries,
 * start watching the header object, and finally announce the disk.
 *
 * Returns 0 on success or a negative errno.  On early failure each
 * resource acquired here is released in reverse order via the goto
 * ladder; once rbd_bus_add_dev() has succeeded, teardown is instead
 * delegated to the sysfs machinery through rbd_bus_del_dev().
 */
static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_probe_update_spec(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	ret = register_blkdev(0, rbd_dev->name);	/* 0: dynamically allocate a major */
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	/* Snapshot registration touches the header, so hold its rwsem. */
	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	/* 1 = start watching the header object (0 stops; see rbd_dev_release()) */
	ret = rbd_dev_header_watch_sync(rbd_dev, 1);
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;
err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}
4049
Alex Eldera30b71b2012-07-10 20:30:11 -05004050/*
4051 * Probe for the existence of the header object for the given rbd
4052 * device. For format 2 images this includes determining the image
4053 * id.
4054 */
4055static int rbd_dev_probe(struct rbd_device *rbd_dev)
4056{
4057 int ret;
4058
4059 /*
4060 * Get the id from the image id object. If it's not a
4061 * format 2 image, we'll get ENOENT back, and we'll assume
4062 * it's a format 1 image.
4063 */
4064 ret = rbd_dev_image_id(rbd_dev);
4065 if (ret)
4066 ret = rbd_dev_v1_probe(rbd_dev);
4067 else
4068 ret = rbd_dev_v2_probe(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05004069 if (ret) {
Alex Eldera30b71b2012-07-10 20:30:11 -05004070 dout("probe failed, returning %d\n", ret);
4071
Alex Elder83a06262012-10-30 15:47:17 -05004072 return ret;
4073 }
4074
4075 ret = rbd_dev_probe_finish(rbd_dev);
4076 if (ret)
4077 rbd_header_free(&rbd_dev->header);
4078
Alex Eldera30b71b2012-07-10 20:30:11 -05004079 return ret;
4080}
4081
/*
 * sysfs "add" handler: parse the user-supplied mapping string in
 * @buf, connect to the cluster, resolve the pool name to an id,
 * create the rbd_device and probe the image.  Returns @count on
 * success or a negative errno.
 *
 * Ownership note: ceph_opts, rbdc and spec are each handed over to
 * the rbd_dev as setup proceeds; the local pointer is NULLed at the
 * moment of transfer so the error ladder frees only what this
 * function still owns.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	/* Hold a module reference for the lifetime of the mapped device. */
	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto err_out_module;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}
	ceph_opts = NULL;	/* rbd_dev client now owns this */

	/* pick the pool */
	osdc = &rbdc->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
	if (rc < 0)
		goto err_out_client;
	spec->pool_id = (u64) rc;

	/* The ceph file layout needs to fit pool id in 32 bits */

	if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
		rc = -EIO;
		goto err_out_client;
	}

	rbd_dev = rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		goto err_out_client;	/* rc is still -ENOMEM here */
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */

	rbd_dev->mapping.read_only = rbd_opts->read_only;
	kfree(rbd_opts);
	rbd_opts = NULL;	/* done with this */

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_rbd_dev;

	return count;
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_opts);
	rbd_spec_put(spec);
err_out_module:
	module_put(THIS_MODULE);

	dout("Error adding device %s\n", buf);

	return (ssize_t) rc;
}
4154
Alex Elderde71a292012-07-03 16:01:19 -05004155static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004156{
4157 struct list_head *tmp;
4158 struct rbd_device *rbd_dev;
4159
Alex Eldere124a822012-01-29 13:57:44 -06004160 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004161 list_for_each(tmp, &rbd_dev_list) {
4162 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05004163 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06004164 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004165 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06004166 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004167 }
Alex Eldere124a822012-01-29 13:57:44 -06004168 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004169 return NULL;
4170}
4171
/*
 * Release callback for the rbd device's embedded struct device;
 * presumably invoked by the driver core after the device has been
 * removed from the bus (NOTE(review): registration site is outside
 * this chunk — confirm).  Undoes the setup performed by
 * rbd_dev_probe_finish(), in reverse order.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop watching the header object (second arg 0 = stop). */
	if (rbd_dev->watch_event)
		rbd_dev_header_watch_sync(rbd_dev, 0);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
4194
/*
 * sysfs "remove" handler: @buf holds the decimal id of the device to
 * unmap.  Fails with -EBUSY if the device is still open; otherwise
 * the REMOVING flag is set under the device spinlock so no new open
 * can race with the teardown, and the device is torn down.  Returns
 * @count on success or a negative errno.
 */
static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	int ret = count;

	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	/*
	 * Check the open count and mark the device REMOVING in one
	 * atomic step, so an open cannot sneak in between the two.
	 */
	spin_lock_irq(&rbd_dev->lock);
	if (rbd_dev->open_count)
		ret = -EBUSY;
	else
		set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
	spin_unlock_irq(&rbd_dev->lock);
	if (ret < 0)
		goto done;

	rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);

done:
	mutex_unlock(&ctl_mutex);

	return ret;
}
4238
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004239/*
4240 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004241 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004242 */
4243static int rbd_sysfs_init(void)
4244{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004245 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004246
Alex Elderfed4c142012-02-07 12:03:36 -06004247 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06004248 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004249 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004250
Alex Elderfed4c142012-02-07 12:03:36 -06004251 ret = bus_register(&rbd_bus_type);
4252 if (ret < 0)
4253 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004254
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004255 return ret;
4256}
4257
/* Tear down the sysfs entries, in reverse order of rbd_sysfs_init(). */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
4263
Alex Eldercc344fa2013-02-19 12:25:56 -06004264static int __init rbd_init(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004265{
4266 int rc;
4267
Alex Elder1e32d342013-01-30 11:13:33 -06004268 if (!libceph_compatible(NULL)) {
4269 rbd_warn(NULL, "libceph incompatibility (quitting)");
4270
4271 return -EINVAL;
4272 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004273 rc = rbd_sysfs_init();
4274 if (rc)
4275 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06004276 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004277 return 0;
4278}
4279
/* Module exit point: remove the sysfs bus and root device. */
static void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
4284
/* Module entry/exit registration and metadata. */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");