blob: 235cda083137221d6fef8db843cf63e68b9529b0 [file] [log] [blame]
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

                 Documentation/ABI/testing/sysfs-bus-rbd

 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* It might be useful to have these defined elsewhere too */

#define	U32_MAX	((u32) (~0U))
#define	U64_MAX	((u64) (~0ULL))

#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

/* Snapshot sysfs device names are "snap_<name>"; bound by NAME_MAX */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN \
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
#define RBD_MAX_OPT_LEN		1024

/* Name used for an unmapped (head) snapshot context */
#define RBD_SNAP_HEAD_NAME	"-"

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */

#define RBD_FEATURE_LAYERING	1

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_ALL	(0)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

/* Mapped devices are read/write unless "read_only"/"ro" is requested */
#define RBD_READ_ONLY_DEFAULT	false
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070098
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These fields never change for a given rbd image */
	char *object_prefix;	/* prefix for data object names */
	u64 features;		/* RBD_FEATURE_* bits (0 for v1 images) */
	__u8 obj_order;		/* object size is 1 << obj_order bytes */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* mapped image size, in bytes */
	struct ceph_snap_context *snapc;
	char *snap_names;	/* NUL-separated snapshot names */
	u64 *snap_sizes;	/* one size per snapshot, same order */

	u64 obj_version;
};
118
/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64 pool_id;
	char *pool_name;

	char *image_id;
	char *image_name;

	u64 snap_id;
	char *snap_name;

	struct kref kref;	/* shared between parent and child rbd_devs */
};
156
/* Per-mapping options parsed from the "add" string */
struct rbd_options {
	bool read_only;
};
160
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client *client;
	struct kref kref;	/* dropped via rbd_put_client() */
	struct list_head node;	/* entry in rbd_client_list */
};
169
/*
 * a request completion status
 */
struct rbd_req_status {
	int done;	/* nonzero once this sub-request has completed */
	s32 rc;		/* completion result code */
	u64 bytes;	/* bytes transferred */
};

/*
 * a collection of requests
 */
struct rbd_req_coll {
	int total;	/* number of status slots allocated */
	int num_done;	/* how many have completed so far */
	struct kref kref;	/* released via rbd_coll_release() */
	struct rbd_req_status status[0];	/* trailing variable-length array */
};
188
/*
 * a single io request
 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;
	int coll_index;			/* slot within coll->status[] */
	struct rbd_req_coll *coll;	/* owning request collection */
};
200
/* In-memory record of one snapshot of an image */
struct rbd_snap {
	struct device dev;	/* sysfs device ("snap_<name>") */
	const char *name;
	u64 size;		/* image size at snapshot time, bytes */
	struct list_head node;	/* entry in rbd_dev->snaps */
	u64 id;
	u64 features;		/* feature bits at snapshot time */
};
209
/* State describing what is currently mapped (head or a snapshot) */
struct rbd_mapping {
	u64 size;	/* size of the mapped snapshot/head, bytes */
	u64 features;
	bool read_only;	/* always true when a snapshot is mapped */
};
215
/*
 * a single device
 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;	/* blkdev's gendisk and rq */

	u32 image_format;	/* Either 1 or 2 */
	struct rbd_client *rbd_client;

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;	/* queue lock */

	struct rbd_image_header header;
	atomic_t exists;	/* nonzero once mapping is established */
	struct rbd_spec *spec;	/* identity of the mapped image */

	char *header_name;	/* name of the image's header object */

	struct ceph_file_layout layout;

	/* watch registration on the header object */
	struct ceph_osd_event *watch_event;
	struct ceph_osd_request *watch_request;

	/* non-NULL iff this image is a clone of a parent image */
	struct rbd_spec *parent_spec;
	u64 parent_overlap;

	/* protects updating the header */
	struct rw_semaphore header_rwsem;

	struct rbd_mapping mapping;

	struct list_head node;	/* entry in rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
	unsigned long open_count;	/* guarded by ctl_mutex */
};
260
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Forward declarations for snapshot refresh/registration helpers */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);

static void rbd_dev_release(struct device *dev);
static void rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800274
/* sysfs bus interface: devices are mapped/unmapped by writing to
 * /sys/bus/rbd/add and /sys/bus/rbd/remove (root-only). */
static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);

static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/* Empty release: rbd_root_dev is static, nothing to free */
static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};
299
/*
 * Emit a warning, identifying the device as precisely as the
 * available information allows: disk name if the gendisk exists,
 * else image name, else image id, else the raw rbd_dev pointer.
 * A NULL rbd_dev yields just the driver name.
 */
static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}
326
/*
 * rbd_assert() - fatal assertion; logs the failed expression and
 * BUG()s.  Compiles to nothing unless RBD_DEBUG is defined.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

/* Forward declarations for header-refresh helpers */
static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700342
/*
 * Block device open callback.  Rejects writable opens of read-only
 * mappings; otherwise pins the sysfs device and bumps open_count
 * (both under ctl_mutex, so remove can check for active openers).
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	(void) get_device(&rbd_dev->dev);	/* hold device while open */
	set_device_ro(bdev, rbd_dev->mapping.read_only);
	rbd_dev->open_count++;
	mutex_unlock(&ctl_mutex);

	return 0;
}
358
/*
 * Block device release callback: undoes rbd_open() by dropping the
 * open count and the device reference taken there.
 */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	rbd_assert(rbd_dev->open_count > 0);
	rbd_dev->open_count--;
	put_device(&rbd_dev->dev);
	mutex_unlock(&ctl_mutex);

	return 0;
}
371
/* Block-device operations: only open/release need special handling */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
377
378/*
379 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500380 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700381 */
Alex Elderf8c38922012-08-10 13:12:07 -0700382static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700383{
384 struct rbd_client *rbdc;
385 int ret = -ENOMEM;
386
387 dout("rbd_client_create\n");
388 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
389 if (!rbdc)
390 goto out_opt;
391
392 kref_init(&rbdc->kref);
393 INIT_LIST_HEAD(&rbdc->node);
394
Alex Elderbc534d82012-01-29 13:57:44 -0600395 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
396
Alex Elder43ae4702012-07-03 16:01:18 -0500397 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700398 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600399 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500400 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700401
402 ret = ceph_open_session(rbdc->client);
403 if (ret < 0)
404 goto out_err;
405
Alex Elder432b8582012-01-29 13:57:44 -0600406 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700407 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600408 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700409
Alex Elderbc534d82012-01-29 13:57:44 -0600410 mutex_unlock(&ctl_mutex);
411
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700412 dout("rbd_client_create created %p\n", rbdc);
413 return rbdc;
414
415out_err:
416 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600417out_mutex:
418 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700419 kfree(rbdc);
420out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500421 if (ceph_opts)
422 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400423 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700424}
425
426/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700427 * Find a ceph client with specific addr and configuration. If
428 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700429 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700430static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700431{
432 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700433 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700434
Alex Elder43ae4702012-07-03 16:01:18 -0500435 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700436 return NULL;
437
Alex Elder1f7ba332012-08-10 13:12:07 -0700438 spin_lock(&rbd_client_list_lock);
439 list_for_each_entry(client_node, &rbd_client_list, node) {
440 if (!ceph_compare_options(ceph_opts, client_node->client)) {
441 kref_get(&client_node->kref);
442 found = true;
443 break;
444 }
445 }
446 spin_unlock(&rbd_client_list_lock);
447
448 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700449}
450
/*
 * mount options
 *
 * The enum is partitioned by sentinel values: tokens below
 * Opt_last_int take an integer argument, tokens between
 * Opt_last_int and Opt_last_string take a string, and tokens
 * between Opt_last_string and Opt_last_bool are Boolean flags.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};
475
/*
 * Parse a single mount-option token into *private (an rbd_options).
 * Called via ceph_parse_options() for each unrecognized option.
 * Returns 0 on success, -EINVAL for an unknown token, or the
 * match_int() error for a malformed integer argument.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* Sentinel ordering in the enum tells us the argument type */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* match_token() succeeded, so this can't happen */
		rbd_assert(false);
		break;
	}
	return 0;
}
516
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Consumes ceph_opts either way: an existing
 * client keeps its own copy, a new client takes ownership.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *client;

	client = rbd_client_find(ceph_opts);
	if (client) {
		/* Reusing an existing client; discard the caller's options */
		ceph_destroy_options(ceph_opts);
		return client;
	}

	return rbd_client_create(ceph_opts);
}
533
/*
 * Destroy ceph client
 *
 * kref release callback: unlinks the client from rbd_client_list
 * (taking rbd_client_list_lock itself) and tears it down.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
551
552/*
553 * Drop reference to ceph client node. If it's not referenced anymore, release
554 * it.
555 */
Alex Elder9d3997f2012-10-25 23:34:42 -0500556static void rbd_put_client(struct rbd_client *rbdc)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700557{
Alex Elderc53d5892012-10-25 23:34:42 -0500558 if (rbdc)
559 kref_put(&rbdc->kref, rbd_client_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700560}
561
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700562/*
563 * Destroy requests collection
564 */
565static void rbd_coll_release(struct kref *kref)
566{
567 struct rbd_req_coll *coll =
568 container_of(kref, struct rbd_req_coll, kref);
569
570 dout("rbd_coll_release %p\n", coll);
571 kfree(coll);
572}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700573
Alex Eldera30b71b2012-07-10 20:30:11 -0500574static bool rbd_image_format_valid(u32 image_format)
575{
576 return image_format == 1 || image_format == 2;
577}
578
/*
 * Sanity-check an on-disk (format 1) image header before trusting
 * any of its fields: magic text, object order bounds, and snapshot
 * counts/name lengths that would overflow size_t arithmetic later.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire the snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
617
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700618/*
619 * Create a new header structure, translate header format from the on-disk
620 * header.
621 */
622static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500623 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700624{
Alex Elderccece232012-07-10 20:30:10 -0500625 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500626 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500627 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500628 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700629
Alex Elder6a523252012-07-19 17:12:59 -0500630 memset(header, 0, sizeof (*header));
631
Alex Elder103a1502012-08-02 11:29:45 -0500632 snap_count = le32_to_cpu(ondisk->snap_count);
633
Alex Elder58c17b02012-08-23 23:22:06 -0500634 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
635 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500636 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700637 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500638 memcpy(header->object_prefix, ondisk->object_prefix, len);
639 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600640
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700641 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500642 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
643
Alex Elder621901d2012-08-23 23:22:06 -0500644 /* Save a copy of the snapshot names */
645
Alex Elderf785cc12012-08-23 23:22:06 -0500646 if (snap_names_len > (u64) SIZE_MAX)
647 return -EIO;
648 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700649 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500650 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500651 /*
652 * Note that rbd_dev_v1_header_read() guarantees
653 * the ondisk buffer we're working with has
654 * snap_names_len bytes beyond the end of the
655 * snapshot id array, this memcpy() is safe.
656 */
657 memcpy(header->snap_names, &ondisk->snaps[snap_count],
658 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500659
Alex Elder621901d2012-08-23 23:22:06 -0500660 /* Record each snapshot's size */
661
Alex Elderd2bb24e2012-07-26 23:37:14 -0500662 size = snap_count * sizeof (*header->snap_sizes);
663 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700664 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500665 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500666 for (i = 0; i < snap_count; i++)
667 header->snap_sizes[i] =
668 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700669 } else {
Alex Elderccece232012-07-10 20:30:10 -0500670 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700671 header->snap_names = NULL;
672 header->snap_sizes = NULL;
673 }
Alex Elder849b4262012-07-09 21:04:24 -0500674
Alex Elder34b13182012-07-13 20:35:12 -0500675 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700676 header->obj_order = ondisk->options.order;
677 header->crypt_type = ondisk->options.crypt_type;
678 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500679
Alex Elder621901d2012-08-23 23:22:06 -0500680 /* Allocate and fill in the snapshot context */
681
Alex Elderf84344f2012-08-31 17:29:51 -0500682 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500683 size = sizeof (struct ceph_snap_context);
684 size += snap_count * sizeof (header->snapc->snaps[0]);
685 header->snapc = kzalloc(size, GFP_KERNEL);
686 if (!header->snapc)
687 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700688
689 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500690 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700691 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500692 for (i = 0; i < snap_count; i++)
693 header->snapc->snaps[i] =
694 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700695
696 return 0;
697
Alex Elder6a523252012-07-19 17:12:59 -0500698out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500699 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500700 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700701 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500702 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500703 kfree(header->object_prefix);
704 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500705
Alex Elder00f1f362012-02-07 12:03:36 -0600706 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700707}
708
Alex Elder9e15b772012-10-30 19:40:33 -0500709static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
710{
711 struct rbd_snap *snap;
712
713 if (snap_id == CEPH_NOSNAP)
714 return RBD_SNAP_HEAD_NAME;
715
716 list_for_each_entry(snap, &rbd_dev->snaps, node)
717 if (snap_id == snap->id)
718 return snap->name;
719
720 return NULL;
721}
722
Alex Elder8836b992012-08-30 14:42:15 -0500723static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700724{
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700725
Alex Eldere86924a2012-07-10 20:30:11 -0500726 struct rbd_snap *snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600727
Alex Eldere86924a2012-07-10 20:30:11 -0500728 list_for_each_entry(snap, &rbd_dev->snaps, node) {
729 if (!strcmp(snap_name, snap->name)) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500730 rbd_dev->spec->snap_id = snap->id;
Alex Eldere86924a2012-07-10 20:30:11 -0500731 rbd_dev->mapping.size = snap->size;
Alex Elder34b13182012-07-13 20:35:12 -0500732 rbd_dev->mapping.features = snap->features;
Alex Elder00f1f362012-02-07 12:03:36 -0600733
Alex Eldere86924a2012-07-10 20:30:11 -0500734 return 0;
Alex Elder00f1f362012-02-07 12:03:36 -0600735 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700736 }
Alex Eldere86924a2012-07-10 20:30:11 -0500737
Alex Elder00f1f362012-02-07 12:03:36 -0600738 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700739}
740
/*
 * Establish what is being mapped.  Mapping the special head name
 * ("-") selects the live image (read/write allowed); any other name
 * selects a snapshot by name and forces the mapping read-only.
 * Marks the device as existing on success.
 */
static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
{
	int ret;

	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		rbd_dev->spec->snap_id = CEPH_NOSNAP;
		rbd_dev->mapping.size = rbd_dev->header.image_size;
		rbd_dev->mapping.features = rbd_dev->header.features;
		ret = 0;
	} else {
		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
		if (ret < 0)
			goto done;
		/* Snapshots are immutable, so the mapping must be RO */
		rbd_dev->mapping.read_only = true;
	}
	atomic_set(&rbd_dev->exists, 1);
done:
	return ret;
}
761
/*
 * Release all dynamically-allocated pieces of an rbd image header.
 * Each pointer is reset to NULL afterward, so calling this twice on
 * the same header is harmless (kfree(NULL) is a no-op).
 */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	/* snapc is reference counted, not directly freed */
	ceph_put_snap_context(header->snapc);
	header->snapc = NULL;
}
773
Alex Elder65ccfe22012-08-09 10:33:26 -0700774static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700775{
Alex Elder65ccfe22012-08-09 10:33:26 -0700776 char *name;
777 u64 segment;
778 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700779
Alex Elder2fd82b92012-11-09 15:05:54 -0600780 name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
Alex Elder65ccfe22012-08-09 10:33:26 -0700781 if (!name)
782 return NULL;
783 segment = offset >> rbd_dev->header.obj_order;
Alex Elder2fd82b92012-11-09 15:05:54 -0600784 ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
Alex Elder65ccfe22012-08-09 10:33:26 -0700785 rbd_dev->header.object_prefix, segment);
Alex Elder2fd82b92012-11-09 15:05:54 -0600786 if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
Alex Elder65ccfe22012-08-09 10:33:26 -0700787 pr_err("error formatting segment name for #%llu (%d)\n",
788 segment, ret);
789 kfree(name);
790 name = NULL;
791 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700792
Alex Elder65ccfe22012-08-09 10:33:26 -0700793 return name;
794}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700795
Alex Elder65ccfe22012-08-09 10:33:26 -0700796static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
797{
798 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700799
Alex Elder65ccfe22012-08-09 10:33:26 -0700800 return offset & (segment_size - 1);
801}
802
803static u64 rbd_segment_length(struct rbd_device *rbd_dev,
804 u64 offset, u64 length)
805{
806 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
807
808 offset &= segment_size - 1;
809
Alex Elderaafb230e2012-09-06 16:00:54 -0500810 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700811 if (offset + length > segment_size)
812 length = segment_size - offset;
813
814 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700815}
816
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700817static int rbd_get_num_segments(struct rbd_image_header *header,
818 u64 ofs, u64 len)
819{
Alex Elderdf111be2012-08-09 10:33:26 -0700820 u64 start_seg;
821 u64 end_seg;
822
823 if (!len)
824 return 0;
825 if (len - 1 > U64_MAX - ofs)
826 return -ERANGE;
827
828 start_seg = ofs >> header->obj_order;
829 end_seg = (ofs + len - 1) >> header->obj_order;
830
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700831 return end_seg - start_seg + 1;
832}
833
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700834/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700835 * returns the size of an object in the image
836 */
837static u64 rbd_obj_bytes(struct rbd_image_header *header)
838{
839 return 1 << header->obj_order;
840}
841
842/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700843 * bio helpers
844 */
845
846static void bio_chain_put(struct bio *chain)
847{
848 struct bio *tmp;
849
850 while (chain) {
851 tmp = chain;
852 chain = chain->bi_next;
853 bio_put(tmp);
854 }
855}
856
/*
 * zeros a bio chain, starting at specific offset
 *
 * Every data byte at or beyond start_ofs (measured from the start of
 * the whole chain) is cleared; bytes before it are left untouched.
 * Used to zero-fill the unread tail of short or missing-object reads.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* running byte position within the chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* zero from start_ofs (or the segment
				 * start, whichever is later) to its end */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
883
/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 *
 * Returns the new bio, or NULL on bad arguments or allocation
 * failure.  The clone shares pages with the source bio; only the
 * bio_vec array is copied.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;
	unsigned short end_idx;
	unsigned short vcnt;
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;	/* byte offset into the first cloned segment */

	/* ...and the last affected segment */

	resid += len;	/* now counts bytes remaining through the range */
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.  On exit of the second loop
	 * above, "resid" is the used length of the final segment.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700964
/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;	/* where to link the next clone */

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi) {
			/* caller asked for more bytes than the chain holds */
			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
			goto out_err;	/* EINVAL; ran out of bio's */
		}
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {
			/* consumed this source bio; move to the next */
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	/* release any clones built so far */
	bio_chain_put(chain);

	return NULL;
}
1027
Alex Elder8d23bf22012-11-19 22:55:21 -06001028static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u64 ofs, u64 len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001029{
Alex Elder139b4312012-11-13 21:11:15 -06001030 struct ceph_osd_req_op *op;
Alex Elder57cfc102012-06-26 12:57:03 -07001031
Alex Elder139b4312012-11-13 21:11:15 -06001032 op = kzalloc(sizeof (*op), GFP_NOIO);
1033 if (!op)
Alex Elder57cfc102012-06-26 12:57:03 -07001034 return NULL;
Alex Elder8d23bf22012-11-19 22:55:21 -06001035
Alex Elder139b4312012-11-13 21:11:15 -06001036 op->op = opcode;
Alex Elder57cfc102012-06-26 12:57:03 -07001037
Alex Elder139b4312012-11-13 21:11:15 -06001038 return op;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001039}
1040
/* Free an op allocated by rbd_create_rw_op(); NULL is allowed. */
static void rbd_destroy_op(struct ceph_osd_req_op *op)
{
	kfree(op);
}
1045
Alex Elder8d23bf22012-11-19 22:55:21 -06001046struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
1047{
1048 struct ceph_osd_req_op *op;
1049 va_list args;
1050
1051 op = kzalloc(sizeof (*op), GFP_NOIO);
1052 if (!op)
1053 return NULL;
1054 op->op = opcode;
1055 va_start(args, opcode);
1056 switch (opcode) {
1057 case CEPH_OSD_OP_READ:
1058 case CEPH_OSD_OP_WRITE:
1059 /* rbd_osd_req_op_create(READ, offset, length) */
1060 /* rbd_osd_req_op_create(WRITE, offset, length) */
1061 op->extent.offset = va_arg(args, u64);
1062 op->extent.length = va_arg(args, u64);
1063 if (opcode == CEPH_OSD_OP_WRITE)
1064 op->payload_len = op->extent.length;
1065 break;
1066 default:
1067 rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
1068 kfree(op);
1069 op = NULL;
1070 break;
1071 }
1072 va_end(args);
1073
1074 return op;
1075}
1076
/* Free an op allocated by rbd_osd_req_op_create(); NULL is allowed. */
static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
{
	kfree(op);
}
1081
/*
 * Record completion of one entry in a request collection, then end
 * (under the queue lock) any maximal run of consecutive completed
 * entries starting at num_done, so blk-layer completions are always
 * delivered in order.  With no collection the request is ended
 * directly.  Each completed entry drops one reference on the coll.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   s32 ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, (int)ret, (unsigned long long)len);

	if (!rq)
		return;

	if (!coll) {
		/* single request, no collection bookkeeping needed */
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	max = min = coll->num_done;
	/* extend max over the contiguous completed entries */
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i<max; i++) {
		__blk_end_request(rq, (int)coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
1119
Alex Elder725afc92012-11-08 08:01:39 -06001120static void rbd_coll_end_req(struct rbd_request *rbd_req,
Alex Elder8986cb32012-11-08 08:01:39 -06001121 s32 ret, u64 len)
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001122{
Alex Elder725afc92012-11-08 08:01:39 -06001123 rbd_coll_end_req_index(rbd_req->rq,
1124 rbd_req->coll, rbd_req->coll_index,
1125 ret, len);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001126}
1127
/*
 * Send ceph osd request
 *
 * Builds, submits, and (when no callback is supplied) waits for an
 * osd request against the named object.  If rbd_cb is NULL the call
 * is synchronous: the request is waited on and released here.  With
 * a callback, completion handling -- including dropping the bio and
 * request references -- is the callback's responsibility.  A private
 * rbd_request is attached via r_priv only when a collection is used.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *op,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *,
					 struct ceph_msg *),
			  struct ceph_osd_request **linger_req,
			  u64 *ver)
{
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;
	struct rbd_request *rbd_req = NULL;
	struct timespec mtime = CURRENT_TIME;
	int ret;

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
		object_name, (unsigned long long) ofs,
		(unsigned long long) len, coll, coll_index);

	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO);
	if (!osd_req)
		return -ENOMEM;

	osd_req->r_flags = flags;
	osd_req->r_pages = pages;
	if (bio) {
		/* extra ref: the osd request now also owns the chain */
		osd_req->r_bio = bio;
		bio_get(osd_req->r_bio);
	}

	if (coll) {
		ret = -ENOMEM;
		rbd_req = kmalloc(sizeof(*rbd_req), GFP_NOIO);
		if (!rbd_req)
			goto done_osd_req;

		rbd_req->rq = rq;
		rbd_req->bio = bio;
		rbd_req->pages = pages;
		rbd_req->len = len;
		rbd_req->coll = coll;
		rbd_req->coll_index = coll_index;
	}

	osd_req->r_callback = rbd_cb;
	osd_req->r_priv = rbd_req;

	strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
	osd_req->r_oid_len = strlen(osd_req->r_oid);

	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
	osd_req->r_num_pages = calc_pages_for(ofs, len);
	osd_req->r_page_alignment = ofs & ~PAGE_MASK;

	ceph_osdc_build_request(osd_req, ofs, len, 1, op,
				snapc, snapid, &mtime);

	if (linger_req) {
		/* keep the request registered (e.g. for watch) */
		ceph_osdc_set_request_linger(osdc, osd_req);
		*linger_req = osd_req;
	}

	ret = ceph_osdc_start_request(osdc, osd_req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* synchronous: wait here and drop our reference */
		u64 version;

		ret = ceph_osdc_wait_request(osdc, osd_req);
		version = le64_to_cpu(osd_req->r_reassert_version.version);
		if (ver)
			*ver = version;
		dout("reassert_ver=%llu\n", (unsigned long long) version);
		ceph_osdc_put_request(osd_req);
	}
	return ret;

done_err:
	/* undo the bio_get() taken above */
	if (bio)
		bio_chain_put(osd_req->r_bio);
	kfree(rbd_req);
done_osd_req:
	ceph_osdc_put_request(osd_req);

	return ret;
}
1227
/*
 * Ceph osd op callback
 *
 * Decodes the reply, normalizes read results (a missing object reads
 * as zeroes; a short read is zero-filled to the requested length),
 * completes the collection entry, and releases the bio chain, the
 * osd request, and the private rbd_request.
 */
static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg)
{
	struct rbd_request *rbd_req = osd_req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);
	rc = (s32)le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
		(unsigned long long) bytes, read_op, (int) rc);

	if (rc == (s32)-ENOENT && read_op) {
		/* nonexistent object: treat as reading all zeroes */
		zero_bio_chain(rbd_req->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < rbd_req->len) {
		/* short read: zero-fill the tail */
		zero_bio_chain(rbd_req->bio, bytes);
		bytes = rbd_req->len;
	}

	rbd_coll_end_req(rbd_req, rc, bytes);

	if (rbd_req->bio)
		bio_chain_put(rbd_req->bio);

	ceph_osdc_put_request(osd_req);
	kfree(rbd_req);
}
1267
/*
 * Completion callback for requests needing no rbd-level handling:
 * just drop the osd request reference.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *osd_req,
			      struct ceph_msg *msg)
{
	ceph_osdc_put_request(osd_req);
}
1273
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a temporary page vector for the data, submits the op via
 * rbd_do_request() with no callback (so it waits for completion), and
 * for reads copies up to "ret" bytes back into the caller's buffer.
 * Returns bytes transferred (from the wait) or a negative errno.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   int flags,
			   struct ceph_osd_req_op *op,
			   const char *object_name,
			   u64 ofs, u64 inbound_size,
			   char *inbound,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(op != NULL);

	num_pages = calc_pages_for(ofs, inbound_size);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  object_name, ofs, inbound_size, NULL,
			  pages, num_pages,
			  flags,
			  op,
			  NULL, 0,
			  NULL,
			  linger_req, ver);
	if (ret < 0)
		goto done;

	/* on a successful read, ret is the number of bytes returned */
	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1315
/*
 * Do an asynchronous ceph osd operation
 *
 * Maps one block-layer request fragment (already clipped to a single
 * segment by the caller) onto a read or write of the corresponding
 * segment object, completing via rbd_req_cb.  On submission failure
 * the collection entry is completed with the error here.
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *op;
	int opcode;
	int flags;
	u64 snapid;

	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	if (rq_data_dir(rq) == WRITE) {
		opcode = CEPH_OSD_OP_WRITE;
		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
		snapid = CEPH_NOSNAP;
	} else {
		opcode = CEPH_OSD_OP_READ;
		flags = CEPH_OSD_FLAG_READ;
		/* reads carry no snap context; they read at snap_id */
		rbd_assert(!snapc);
		snapid = rbd_dev->spec->snap_id;
	}

	ret = -ENOMEM;
	op = rbd_osd_req_op_create(opcode, seg_ofs, seg_len);
	if (!op)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     op,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);
	if (ret < 0)
		rbd_coll_end_req_index(rq, coll, coll_index,
					(s32)ret, seg_len);
	rbd_osd_req_op_destroy(op);
done:
	kfree(seg_name);
	return ret;
}
1379
1380/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001381 * Request sync osd read
1382 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001383static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001384 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001385 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001386 char *buf,
1387 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001388{
Alex Elder139b4312012-11-13 21:11:15 -06001389 struct ceph_osd_req_op *op;
Alex Elder913d2fd2012-06-26 12:57:03 -07001390 int ret;
1391
Alex Elder8d23bf22012-11-19 22:55:21 -06001392 op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, ofs, len);
Alex Elder139b4312012-11-13 21:11:15 -06001393 if (!op)
Alex Elder913d2fd2012-06-26 12:57:03 -07001394 return -ENOMEM;
1395
Alex Elder25704ac2012-11-09 08:43:16 -06001396 ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ,
Alex Elder30573d62012-11-13 21:11:15 -06001397 op, object_name, ofs, len, buf, NULL, ver);
Alex Elder8d23bf22012-11-19 22:55:21 -06001398 rbd_osd_req_op_destroy(op);
Alex Elder913d2fd2012-06-26 12:57:03 -07001399
1400 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001401}
1402
/*
 * Request sync osd watch
 *
 * Acknowledge a watch notification on the header object so the OSD
 * stops re-sending it.  Completion is fire-and-forget via
 * rbd_simple_req_cb.
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *op;
	int ret;

	op = rbd_create_rw_op(CEPH_OSD_OP_NOTIFY_ACK, 0, 0);
	if (!op)
		return -ENOMEM;

	op->watch.ver = cpu_to_le64(ver);
	/*
	 * NOTE(review): cookie is stored without cpu_to_le64(), unlike
	 * watch.ver here and watch.cookie in rbd_req_sync_watch() --
	 * confirm notify_id is already in wire byte order as delivered
	 * to rbd_watch_cb().
	 */
	op->watch.cookie = notify_id;
	op->watch.flag = 0;

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  rbd_dev->header_name, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  op,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_op(op);
	return ret;
}
1432
1433static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1434{
Alex Elder0ce1a792012-07-03 16:01:18 -05001435 struct rbd_device *rbd_dev = (struct rbd_device *)data;
Josh Durgina71b8912011-12-05 18:10:44 -08001436 u64 hver;
Sage Weil13143d22011-05-12 16:08:30 -07001437 int rc;
1438
Alex Elder0ce1a792012-07-03 16:01:18 -05001439 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001440 return;
1441
Alex Elderbd919d42012-07-13 20:35:11 -05001442 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1443 rbd_dev->header_name, (unsigned long long) notify_id,
1444 (unsigned int) opcode);
Alex Elder117973f2012-08-31 17:29:55 -05001445 rc = rbd_dev_refresh(rbd_dev, &hver);
Sage Weil13143d22011-05-12 16:08:30 -07001446 if (rc)
Alex Elder06ecc6c2012-11-01 10:17:15 -05001447 rbd_warn(rbd_dev, "got notification but failed to "
1448 " update snaps: %d\n", rc);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001449
Alex Elder7f0a24d2012-07-25 09:32:40 -05001450 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001451}
1452
/*
 * Request sync osd watch/unwatch.  The value of "start" determines
 * whether a watch request is being initiated or torn down.
 *
 * Starting registers a watch event and makes the osd request linger
 * so notifications keep arriving; stopping (or a failed start)
 * cancels the event.  Returns 0 or a negative errno.
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
{
	struct ceph_osd_req_op *op;
	struct ceph_osd_request **linger_req = NULL;
	__le64 version = 0;
	int ret;

	op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0, 0);
	if (!op)
		return -ENOMEM;

	if (start) {
		struct ceph_osd_client *osdc;

		osdc = &rbd_dev->rbd_client->client->osdc;
		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev,
						&rbd_dev->watch_event);
		if (ret < 0)
			goto done;
		version = cpu_to_le64(rbd_dev->header.obj_version);
		linger_req = &rbd_dev->watch_request;
	}

	op->watch.ver = version;
	op->watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	/* the cast applies to "start" only; flag is 1 when starting */
	op->watch.flag = (u8) start ? 1 : 0;

	ret = rbd_req_sync_op(rbd_dev,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      op, rbd_dev->header_name,
			      0, 0, NULL, linger_req, NULL);

	/* tear down the event on unwatch, or if starting failed */
	if (!start || ret < 0) {
		ceph_osdc_cancel_event(rbd_dev->watch_event);
		rbd_dev->watch_event = NULL;
	}
done:
	rbd_destroy_op(op);

	return ret;
}
1498
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001499/*
Alex Elder3cb4a682012-06-26 12:57:03 -07001500 * Synchronous osd object method call
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001501 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001502static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001503 const char *object_name,
1504 const char *class_name,
1505 const char *method_name,
Alex Elder3cb4a682012-06-26 12:57:03 -07001506 const char *outbound,
1507 size_t outbound_size,
Alex Elderf8d4de62012-07-03 16:01:19 -05001508 char *inbound,
1509 size_t inbound_size,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001510 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001511{
Alex Elder139b4312012-11-13 21:11:15 -06001512 struct ceph_osd_req_op *op;
Alex Elderaded07e2012-07-03 16:01:18 -05001513 int class_name_len = strlen(class_name);
1514 int method_name_len = strlen(method_name);
Alex Elder3cb4a682012-06-26 12:57:03 -07001515 int payload_size;
Alex Elder57cfc102012-06-26 12:57:03 -07001516 int ret;
1517
Alex Elder3cb4a682012-06-26 12:57:03 -07001518 /*
1519 * Any input parameters required by the method we're calling
1520 * will be sent along with the class and method names as
1521 * part of the message payload. That data and its size are
1522 * supplied via the indata and indata_len fields (named from
1523 * the perspective of the server side) in the OSD request
1524 * operation.
1525 */
1526 payload_size = class_name_len + method_name_len + outbound_size;
Alex Elder8d23bf22012-11-19 22:55:21 -06001527 op = rbd_create_rw_op(CEPH_OSD_OP_CALL, 0, 0);
Alex Elder139b4312012-11-13 21:11:15 -06001528 if (!op)
Alex Elder57cfc102012-06-26 12:57:03 -07001529 return -ENOMEM;
Alex Elder8d23bf22012-11-19 22:55:21 -06001530 op->payload_len = payload_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001531
Alex Elder139b4312012-11-13 21:11:15 -06001532 op->cls.class_name = class_name;
1533 op->cls.class_len = (__u8) class_name_len;
1534 op->cls.method_name = method_name;
1535 op->cls.method_len = (__u8) method_name_len;
1536 op->cls.argc = 0;
1537 op->cls.indata = outbound;
1538 op->cls.indata_len = outbound_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001539
Alex Elder30573d62012-11-13 21:11:15 -06001540 ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op,
Alex Elderf8d4de62012-07-03 16:01:19 -05001541 object_name, 0, inbound_size, inbound,
1542 NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001543
Alex Elder139b4312012-11-13 21:11:15 -06001544 rbd_destroy_op(op);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001545
1546 dout("cls_exec returned %d\n", ret);
1547 return ret;
1548}
1549
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001550static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1551{
1552 struct rbd_req_coll *coll =
1553 kzalloc(sizeof(struct rbd_req_coll) +
1554 sizeof(struct rbd_req_status) * num_reqs,
1555 GFP_ATOMIC);
1556
1557 if (!coll)
1558 return NULL;
1559 coll->total = num_reqs;
1560 kref_init(&coll->kref);
1561 return coll;
1562}
1563
/*
 * Issue the osd requests needed to satisfy one block-layer request.
 *
 * The request's bio chain is split at rbd object boundaries and one
 * osd op is issued per segment.  Per-segment completion status is
 * gathered in a shared rbd_req_coll; each outstanding segment holds
 * its own reference on the collection.  Returns 0 once all segments
 * have been dispatched (segment errors are reported through the
 * collection), or a negative value/errno if setup fails.
 */
static int rbd_dev_do_request(struct request *rq,
				struct rbd_device *rbd_dev,
				struct ceph_snap_context *snapc,
				u64 ofs, unsigned int size,
				struct bio *bio_chain)
{
	int num_segs;
	struct rbd_req_coll *coll;
	unsigned int bio_offset;
	int cur_seg = 0;

	dout("%s 0x%x bytes at 0x%llx\n",
		rq_data_dir(rq) == WRITE ? "write" : "read",
		size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

	num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
	if (num_segs <= 0)
		return num_segs;

	coll = rbd_alloc_coll(num_segs);
	if (!coll)
		return -ENOMEM;

	bio_offset = 0;
	do {
		/* Each segment ends no later than the next object boundary */
		u64 limit = rbd_segment_length(rbd_dev, ofs, size);
		unsigned int clone_size;
		struct bio *bio_clone;

		BUG_ON(limit > (u64)UINT_MAX);
		clone_size = (unsigned int)limit;
		dout("bio_chain->bi_vcnt=%hu\n", bio_chain->bi_vcnt);

		/* Reference for this segment; dropped on its completion */
		kref_get(&coll->kref);

		/* Pass a cloned bio chain via an osd request */

		bio_clone = bio_chain_clone_range(&bio_chain,
						&bio_offset, clone_size,
						GFP_ATOMIC);
		if (bio_clone)
			(void)rbd_do_op(rq, rbd_dev, snapc,
					ofs, clone_size,
					bio_clone, coll, cur_seg);
		else
			/* Clone failed: record -ENOMEM for this segment */
			rbd_coll_end_req_index(rq, coll, cur_seg,
						(s32)-ENOMEM,
						clone_size);
		size -= clone_size;
		ofs += clone_size;

		cur_seg++;
	} while (size > 0);
	/* Drop the allocation's initial reference */
	kref_put(&coll->kref, rbd_coll_release);

	return 0;
}
1621
/*
 * block device queue callback
 *
 * Pulls requests off the queue and issues the corresponding osd
 * requests.  Entered with q->queue_lock held; the lock is dropped
 * while each request is submitted and re-taken before completing
 * it or fetching the next one.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	bool read_only = rbd_dev->mapping.read_only;
	struct request *rq;

	while ((rq = blk_fetch_request(q))) {
		struct ceph_snap_context *snapc = NULL;
		unsigned int size = 0;
		int result;

		dout("fetched request\n");

		/* Filter out block requests we don't understand */

		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}
		spin_unlock_irq(q->queue_lock);

		/* Write requests need a reference to the snapshot context */

		if (rq_data_dir(rq) == WRITE) {
			result = -EROFS;
			if (read_only) /* Can't write to a read-only device */
				goto out_end_request;

			/*
			 * Note that each osd request will take its
			 * own reference to the snapshot context
			 * supplied.  The reference we take here
			 * just guarantees the one we provide stays
			 * valid.
			 */
			down_read(&rbd_dev->header_rwsem);
			snapc = ceph_get_snap_context(rbd_dev->header.snapc);
			up_read(&rbd_dev->header_rwsem);
			rbd_assert(snapc != NULL);
		} else if (!atomic_read(&rbd_dev->exists)) {
			/* Mapped snapshot was deleted under us */
			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
			dout("request for non-existent snapshot");
			result = -ENXIO;
			goto out_end_request;
		}

		size = blk_rq_bytes(rq);
		result = rbd_dev_do_request(rq, rbd_dev, snapc,
				blk_rq_pos(rq) * SECTOR_SIZE,
				size, rq->bio);
out_end_request:
		if (snapc)
			ceph_put_snap_context(snapc);
		spin_lock_irq(q->queue_lock);
		/*
		 * Empty or failed requests are completed here; otherwise
		 * completion is driven by the per-segment osd callbacks.
		 */
		if (!size || result < 0)
			__blk_end_request_all(rq, result);
	}
}
1683
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone_range()
 *
 * Returns the number of bytes of "bvec" that may be appended to the
 * bio described by "bmd" without crossing an rbd object boundary
 * (possibly 0, and never more than bvec->bv_len).
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	sector_t sector_offset;
	sector_t sectors_per_obj;
	sector_t obj_sector_offset;
	int ret;

	/*
	 * Find how far into its rbd object the partition-relative
	 * bio start sector is to offset relative to the enclosing
	 * device.
	 */
	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	obj_sector_offset = sector_offset & (sectors_per_obj - 1);

	/*
	 * Compute the number of bytes from that offset to the end
	 * of the object.  Account for what's already used by the bio.
	 */
	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
	if (ret > bmd->bi_size)
		ret -= bmd->bi_size;
	else
		ret = 0;

	/*
	 * Don't send back more than was asked for.  And if the bio
	 * was empty, let the whole thing through because:  "Note
	 * that a block device *must* allow a single page to be
	 * added to an empty bio."
	 */
	rbd_assert(bvec->bv_len <= PAGE_SIZE);
	if (ret > (int) bvec->bv_len || !bmd->bi_size)
		ret = (int) bvec->bv_len;

	return ret;
}
1729
1730static void rbd_free_disk(struct rbd_device *rbd_dev)
1731{
1732 struct gendisk *disk = rbd_dev->disk;
1733
1734 if (!disk)
1735 return;
1736
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001737 if (disk->flags & GENHD_FL_UP)
1738 del_gendisk(disk);
1739 if (disk->queue)
1740 blk_cleanup_queue(disk->queue);
1741 put_disk(disk);
1742}
1743
1744/*
Alex Elder4156d992012-08-02 11:29:46 -05001745 * Read the complete header for the given rbd device.
1746 *
1747 * Returns a pointer to a dynamically-allocated buffer containing
1748 * the complete and validated header. Caller can pass the address
1749 * of a variable that will be filled in with the version of the
1750 * header object at the time it was read.
1751 *
1752 * Returns a pointer-coded errno if a failure occurs.
1753 */
1754static struct rbd_image_header_ondisk *
1755rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
1756{
1757 struct rbd_image_header_ondisk *ondisk = NULL;
1758 u32 snap_count = 0;
1759 u64 names_size = 0;
1760 u32 want_count;
1761 int ret;
1762
1763 /*
1764 * The complete header will include an array of its 64-bit
1765 * snapshot ids, followed by the names of those snapshots as
1766 * a contiguous block of NUL-terminated strings. Note that
1767 * the number of snapshots could change by the time we read
1768 * it in, in which case we re-read it.
1769 */
1770 do {
1771 size_t size;
1772
1773 kfree(ondisk);
1774
1775 size = sizeof (*ondisk);
1776 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1777 size += names_size;
1778 ondisk = kmalloc(size, GFP_KERNEL);
1779 if (!ondisk)
1780 return ERR_PTR(-ENOMEM);
1781
Alex Elder47756182012-11-09 08:43:15 -06001782 ret = rbd_req_sync_read(rbd_dev, rbd_dev->header_name,
Alex Elder4156d992012-08-02 11:29:46 -05001783 0, size,
1784 (char *) ondisk, version);
1785
1786 if (ret < 0)
1787 goto out_err;
1788 if (WARN_ON((size_t) ret < size)) {
1789 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05001790 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
1791 size, ret);
Alex Elder4156d992012-08-02 11:29:46 -05001792 goto out_err;
1793 }
1794 if (!rbd_dev_ondisk_valid(ondisk)) {
1795 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05001796 rbd_warn(rbd_dev, "invalid header");
Alex Elder4156d992012-08-02 11:29:46 -05001797 goto out_err;
1798 }
1799
1800 names_size = le64_to_cpu(ondisk->snap_names_len);
1801 want_count = snap_count;
1802 snap_count = le32_to_cpu(ondisk->snap_count);
1803 } while (snap_count != want_count);
1804
1805 return ondisk;
1806
1807out_err:
1808 kfree(ondisk);
1809
1810 return ERR_PTR(ret);
1811}
1812
1813/*
1814 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001815 */
1816static int rbd_read_header(struct rbd_device *rbd_dev,
1817 struct rbd_image_header *header)
1818{
Alex Elder4156d992012-08-02 11:29:46 -05001819 struct rbd_image_header_ondisk *ondisk;
1820 u64 ver = 0;
1821 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001822
Alex Elder4156d992012-08-02 11:29:46 -05001823 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1824 if (IS_ERR(ondisk))
1825 return PTR_ERR(ondisk);
1826 ret = rbd_header_from_disk(header, ondisk);
1827 if (ret >= 0)
1828 header->obj_version = ver;
1829 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001830
Alex Elder4156d992012-08-02 11:29:46 -05001831 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001832}
1833
Alex Elder41f38c22012-10-25 23:34:40 -05001834static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001835{
1836 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001837 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001838
Alex Eldera0593292012-07-19 09:09:27 -05001839 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder41f38c22012-10-25 23:34:40 -05001840 rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001841}
1842
Alex Elder94785542012-10-09 13:50:17 -07001843static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1844{
1845 sector_t size;
1846
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001847 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Alex Elder94785542012-10-09 13:50:17 -07001848 return;
1849
1850 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1851 dout("setting size to %llu sectors", (unsigned long long) size);
1852 rbd_dev->mapping.size = (u64) size;
1853 set_capacity(rbd_dev->disk, size);
1854}
1855
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001856/*
1857 * only read the first part of the ondisk header, without the snaps info
1858 */
Alex Elder117973f2012-08-31 17:29:55 -05001859static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001860{
1861 int ret;
1862 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001863
1864 ret = rbd_read_header(rbd_dev, &h);
1865 if (ret < 0)
1866 return ret;
1867
Josh Durgina51aa0c2011-12-05 10:35:04 -08001868 down_write(&rbd_dev->header_rwsem);
1869
Alex Elder94785542012-10-09 13:50:17 -07001870 /* Update image size, and check for resize of mapped image */
1871 rbd_dev->header.image_size = h.image_size;
1872 rbd_update_mapping_size(rbd_dev);
Sage Weil9db4b3e2011-04-19 22:49:06 -07001873
Alex Elder849b4262012-07-09 21:04:24 -05001874 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001875 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05001876 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08001877 /* osd requests may still refer to snapc */
1878 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001879
Alex Elderb8136232012-07-25 09:32:41 -05001880 if (hver)
1881 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08001882 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08001883 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001884 rbd_dev->header.snapc = h.snapc;
1885 rbd_dev->header.snap_names = h.snap_names;
1886 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05001887 /* Free the extra copy of the object prefix */
1888 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1889 kfree(h.object_prefix);
1890
Alex Elder304f6802012-08-31 17:29:52 -05001891 ret = rbd_dev_snaps_update(rbd_dev);
1892 if (!ret)
1893 ret = rbd_dev_snaps_register(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001894
Josh Durginc6666012011-11-21 17:11:12 -08001895 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001896
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001897 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001898}
1899
Alex Elder117973f2012-08-31 17:29:55 -05001900static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
Alex Elder1fe5e992012-07-25 09:32:41 -05001901{
1902 int ret;
1903
Alex Elder117973f2012-08-31 17:29:55 -05001904 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder1fe5e992012-07-25 09:32:41 -05001905 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05001906 if (rbd_dev->image_format == 1)
1907 ret = rbd_dev_v1_refresh(rbd_dev, hver);
1908 else
1909 ret = rbd_dev_v2_refresh(rbd_dev, hver);
Alex Elder1fe5e992012-07-25 09:32:41 -05001910 mutex_unlock(&ctl_mutex);
1911
1912 return ret;
1913}
1914
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001915static int rbd_init_disk(struct rbd_device *rbd_dev)
1916{
1917 struct gendisk *disk;
1918 struct request_queue *q;
Alex Elder593a9e72012-02-07 12:03:37 -06001919 u64 segment_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001920
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001921 /* create gendisk info */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001922 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1923 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05001924 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001925
Alex Elderf0f8cef2012-01-29 13:57:44 -06001926 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05001927 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001928 disk->major = rbd_dev->major;
1929 disk->first_minor = 0;
1930 disk->fops = &rbd_bd_ops;
1931 disk->private_data = rbd_dev;
1932
1933 /* init rq */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001934 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1935 if (!q)
1936 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07001937
Alex Elder593a9e72012-02-07 12:03:37 -06001938 /* We use the default size, but let's be explicit about it. */
1939 blk_queue_physical_block_size(q, SECTOR_SIZE);
1940
Josh Durgin029bcbd2011-07-22 11:35:23 -07001941 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06001942 segment_size = rbd_obj_bytes(&rbd_dev->header);
1943 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1944 blk_queue_max_segment_size(q, segment_size);
1945 blk_queue_io_min(q, segment_size);
1946 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07001947
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001948 blk_queue_merge_bvec(q, rbd_merge_bvec);
1949 disk->queue = q;
1950
1951 q->queuedata = rbd_dev;
1952
1953 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001954
Alex Elder12f02942012-08-29 17:11:07 -05001955 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
1956
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001957 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001958out_disk:
1959 put_disk(disk);
Alex Elder1fcdb8a2012-08-29 17:11:06 -05001960
1961 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001962}
1963
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001964/*
1965 sysfs
1966*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001967
/* Map a sysfs struct device back to its enclosing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1972
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001973static ssize_t rbd_size_show(struct device *dev,
1974 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001975{
Alex Elder593a9e72012-02-07 12:03:37 -06001976 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001977 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001978
Josh Durgina51aa0c2011-12-05 10:35:04 -08001979 down_read(&rbd_dev->header_rwsem);
1980 size = get_capacity(rbd_dev->disk);
1981 up_read(&rbd_dev->header_rwsem);
1982
1983 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001984}
1985
Alex Elder34b13182012-07-13 20:35:12 -05001986/*
1987 * Note this shows the features for whatever's mapped, which is not
1988 * necessarily the base image.
1989 */
1990static ssize_t rbd_features_show(struct device *dev,
1991 struct device_attribute *attr, char *buf)
1992{
1993 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1994
1995 return sprintf(buf, "0x%016llx\n",
1996 (unsigned long long) rbd_dev->mapping.features);
1997}
1998
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001999static ssize_t rbd_major_show(struct device *dev,
2000 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002001{
Alex Elder593a9e72012-02-07 12:03:37 -06002002 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002003
2004 return sprintf(buf, "%d\n", rbd_dev->major);
2005}
2006
2007static ssize_t rbd_client_id_show(struct device *dev,
2008 struct device_attribute *attr, char *buf)
2009{
Alex Elder593a9e72012-02-07 12:03:37 -06002010 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002011
Alex Elder1dbb4392012-01-24 10:08:37 -06002012 return sprintf(buf, "client%lld\n",
2013 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002014}
2015
2016static ssize_t rbd_pool_show(struct device *dev,
2017 struct device_attribute *attr, char *buf)
2018{
Alex Elder593a9e72012-02-07 12:03:37 -06002019 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002020
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002021 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002022}
2023
Alex Elder9bb2f332012-07-12 10:46:35 -05002024static ssize_t rbd_pool_id_show(struct device *dev,
2025 struct device_attribute *attr, char *buf)
2026{
2027 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2028
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002029 return sprintf(buf, "%llu\n",
2030 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05002031}
2032
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002033static ssize_t rbd_name_show(struct device *dev,
2034 struct device_attribute *attr, char *buf)
2035{
Alex Elder593a9e72012-02-07 12:03:37 -06002036 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002037
Alex Eldera92ffdf2012-10-30 19:40:33 -05002038 if (rbd_dev->spec->image_name)
2039 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2040
2041 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002042}
2043
Alex Elder589d30e2012-07-10 20:30:11 -05002044static ssize_t rbd_image_id_show(struct device *dev,
2045 struct device_attribute *attr, char *buf)
2046{
2047 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2048
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002049 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05002050}
2051
Alex Elder34b13182012-07-13 20:35:12 -05002052/*
2053 * Shows the name of the currently-mapped snapshot (or
2054 * RBD_SNAP_HEAD_NAME for the base image).
2055 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002056static ssize_t rbd_snap_show(struct device *dev,
2057 struct device_attribute *attr,
2058 char *buf)
2059{
Alex Elder593a9e72012-02-07 12:03:37 -06002060 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002061
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002062 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002063}
2064
Alex Elder86b00e02012-10-25 23:34:42 -05002065/*
2066 * For an rbd v2 image, shows the pool id, image id, and snapshot id
2067 * for the parent image. If there is no parent, simply shows
2068 * "(no parent image)".
2069 */
2070static ssize_t rbd_parent_show(struct device *dev,
2071 struct device_attribute *attr,
2072 char *buf)
2073{
2074 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2075 struct rbd_spec *spec = rbd_dev->parent_spec;
2076 int count;
2077 char *bufp = buf;
2078
2079 if (!spec)
2080 return sprintf(buf, "(no parent image)\n");
2081
2082 count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
2083 (unsigned long long) spec->pool_id, spec->pool_name);
2084 if (count < 0)
2085 return count;
2086 bufp += count;
2087
2088 count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
2089 spec->image_name ? spec->image_name : "(unknown)");
2090 if (count < 0)
2091 return count;
2092 bufp += count;
2093
2094 count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
2095 (unsigned long long) spec->snap_id, spec->snap_name);
2096 if (count < 0)
2097 return count;
2098 bufp += count;
2099
2100 count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
2101 if (count < 0)
2102 return count;
2103 bufp += count;
2104
2105 return (ssize_t) (bufp - buf);
2106}
2107
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002108static ssize_t rbd_image_refresh(struct device *dev,
2109 struct device_attribute *attr,
2110 const char *buf,
2111 size_t size)
2112{
Alex Elder593a9e72012-02-07 12:03:37 -06002113 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05002114 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002115
Alex Elder117973f2012-08-31 17:29:55 -05002116 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05002117
2118 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002119}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002120
/* Device attributes: all read-only, except write-only "refresh" */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* Intentionally empty; rbd_device teardown happens elsewhere
 * (see rbd_dev_destroy()). */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
2166
2167
2168/*
2169 sysfs - snapshots
2170*/
2171
2172static ssize_t rbd_snap_size_show(struct device *dev,
2173 struct device_attribute *attr,
2174 char *buf)
2175{
2176 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2177
Josh Durgin35915382011-12-05 18:25:13 -08002178 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002179}
2180
2181static ssize_t rbd_snap_id_show(struct device *dev,
2182 struct device_attribute *attr,
2183 char *buf)
2184{
2185 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2186
Josh Durgin35915382011-12-05 18:25:13 -08002187 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002188}
2189
Alex Elder34b13182012-07-13 20:35:12 -05002190static ssize_t rbd_snap_features_show(struct device *dev,
2191 struct device_attribute *attr,
2192 char *buf)
2193{
2194 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2195
2196 return sprintf(buf, "0x%016llx\n",
2197 (unsigned long long) snap->features);
2198}
2199
/* Per-snapshot attributes: all read-only */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* Device-model release callback: frees the rbd_snap and its name. */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2231
Alex Elder8b8fb992012-10-26 17:25:24 -05002232static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2233{
2234 kref_get(&spec->kref);
2235
2236 return spec;
2237}
2238
2239static void rbd_spec_free(struct kref *kref);
2240static void rbd_spec_put(struct rbd_spec *spec)
2241{
2242 if (spec)
2243 kref_put(&spec->kref, rbd_spec_free);
2244}
2245
2246static struct rbd_spec *rbd_spec_alloc(void)
2247{
2248 struct rbd_spec *spec;
2249
2250 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2251 if (!spec)
2252 return NULL;
2253 kref_init(&spec->kref);
2254
2255 rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */
2256
2257 return spec;
2258}
2259
2260static void rbd_spec_free(struct kref *kref)
2261{
2262 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2263
2264 kfree(spec->pool_name);
2265 kfree(spec->image_id);
2266 kfree(spec->image_name);
2267 kfree(spec->snap_name);
2268 kfree(spec);
2269}
2270
/*
 * Allocate and initialize a new rbd_device.  On success the device
 * takes over the caller's references to both rbdc and spec -- they
 * are dropped in rbd_dev_destroy().  Returns NULL if the allocation
 * fails.
 */
struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
					struct rbd_spec *spec)
{
	struct rbd_device *rbd_dev;

	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		return NULL;

	spin_lock_init(&rbd_dev->lock);
	atomic_set(&rbd_dev->exists, 0);
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);
	init_rwsem(&rbd_dev->header_rwsem);

	rbd_dev->spec = spec;
	rbd_dev->rbd_client = rbdc;

	/* Initialize the layout used for all rbd requests */

	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	/* The ceph file layout can only represent a 32-bit pool id */
	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);

	return rbd_dev;
}
2298
/*
 * Free an rbd_device, dropping the client and spec references it
 * holds (both the mapped spec and any parent image spec).
 */
static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	rbd_spec_put(rbd_dev->parent_spec);
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev);
}
2307
/*
 * Report whether a snapshot's device has been registered.  The
 * device type is only assigned when the device is registered, so
 * the two indicators must agree; the assertion catches them ever
 * getting out of sync.
 */
static bool rbd_snap_registered(struct rbd_snap *snap)
{
	bool ret = snap->dev.type == &rbd_snap_device_type;
	bool reg = device_is_registered(&snap->dev);

	rbd_assert(!ret ^ reg);

	return ret;
}
2317
/*
 * Unlink a snapshot from its rbd_dev's list and, if its device was
 * registered, unregister it (freeing happens via the release hook).
 */
static void rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	if (device_is_registered(&snap->dev))
		device_unregister(&snap->dev);
}
2324
/*
 * Register a snapshot's device in sysfs under the given parent (the
 * rbd device).  The device name is the snapshot name with the
 * standard snapshot prefix.  Returns the device_register() result.
 */
static int rbd_register_snap_dev(struct rbd_snap *snap,
				  struct device *parent)
{
	struct device *dev = &snap->dev;
	int ret;

	dev->type = &rbd_snap_device_type;
	dev->parent = parent;
	dev->release = rbd_snap_dev_release;
	dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
	dout("%s: registering device for snapshot %s\n", __func__, snap->name);

	ret = device_register(dev);

	return ret;
}
2341
/*
 * Allocate and fill in a new rbd_snap with the given name, id, size
 * and feature bits.  The snapshot's device is *not* registered here
 * (see rbd_register_snap_dev()).  Returns the new snapshot, or an
 * ERR_PTR() on allocation failure.
 */
static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
						const char *snap_name,
						u64 snap_id, u64 snap_size,
						u64 snap_features)
{
	struct rbd_snap *snap;
	int ret;

	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
	if (!snap)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	snap->name = kstrdup(snap_name, GFP_KERNEL);
	if (!snap->name)
		goto err;

	snap->id = snap_id;
	snap->size = snap_size;
	snap->features = snap_features;

	return snap;

err:
	kfree(snap->name);	/* NULL here; kfree(NULL) is a no-op */
	kfree(snap);

	return ERR_PTR(ret);
}
2371
Alex Eldercd892122012-07-03 16:01:19 -05002372static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2373 u64 *snap_size, u64 *snap_features)
2374{
2375 char *snap_name;
2376
2377 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2378
2379 *snap_size = rbd_dev->header.snap_sizes[which];
2380 *snap_features = 0; /* No features for v1 */
2381
2382 /* Skip over names until we find the one we are looking for */
2383
2384 snap_name = rbd_dev->header.snap_names;
2385 while (which--)
2386 snap_name += strlen(snap_name) + 1;
2387
2388 return snap_name;
2389}
2390
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.  Issues the "get_size" class method against the image
 * header object.  Returns 0 on success, negative errno otherwise.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf), NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	*order = size_buf.order;	/* single byte; no byte swap needed */
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2423
/* Get the size and object order of the base image (CEPH_NOSNAP) */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
2430
/*
 * Fetch the image's object name prefix via the "get_object_prefix"
 * class method and record it in the in-core header.  Returns 0 on
 * success, negative errno otherwise.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	/* The prefix is stored as a length-prefixed string; copy it out */
	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
2467
/*
 * Get the feature bits for an image snapshot, or with CEPH_NOSNAP
 * for the base image.  Fails with -ENXIO if the image requires any
 * incompatible features this client does not implement (bits
 * outside RBD_FEATURES_ALL).
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	struct {
		__le64 features;
		__le64 incompat;
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Refuse to map an image whose incompatible features we lack */
	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_ALL)
		return -ENXIO;

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2501
/* Get the base image's feature bits (CEPH_NOSNAP) */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}
2507
/*
 * Query the image's parent (for layered/cloned images) via the
 * "get_parent" class method.  If the image has a parent, record its
 * spec (pool id, image id, snap id) and the overlap size in the
 * rbd_dev.  An image with no parent (pool id CEPH_NOPOOL) is not an
 * error.  Returns 0 on success, negative errno otherwise.
 */
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	size_t size;
	void *reply_buf = NULL;
	__le64 snapid;
	void *p;
	void *end;
	char *image_id;
	u64 overlap;
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	/* Worst-case encoded reply size */
	size = sizeof (__le64) +				/* pool_id */
		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
		sizeof (__le64) +				/* snap_id */
		sizeof (__le64);				/* overlap */
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf) {
		ret = -ENOMEM;
		goto out_err;
	}

	snapid = cpu_to_le64(CEPH_NOSNAP);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_parent",
				(char *) &snapid, sizeof (snapid),
				(char *) reply_buf, size, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out_err;

	ret = -ERANGE;		/* default for decode failures below */
	p = reply_buf;
	end = (char *) reply_buf + size;
	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
	if (parent_spec->pool_id == CEPH_NOPOOL)
		goto out;	/* No parent?  No problem. */

	/* The ceph file layout needs to fit pool id in 32 bits */

	ret = -EIO;
	if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
		goto out;

	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(image_id)) {
		ret = PTR_ERR(image_id);
		goto out_err;
	}
	parent_spec->image_id = image_id;
	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
	ceph_decode_64_safe(&p, end, overlap, out_err);

	rbd_dev->parent_overlap = overlap;
	rbd_dev->parent_spec = parent_spec;
	parent_spec = NULL;	/* rbd_dev now owns this */
out:
	ret = 0;
out_err:
	kfree(reply_buf);
	rbd_spec_put(parent_spec);	/* no-op if ownership transferred */

	return ret;
}
2576
/*
 * Look up a format 2 image's name given its image id, by querying
 * the "dir_get_name" method on the RBD_DIRECTORY object.  Returns a
 * newly-allocated name string owned by the caller, or NULL on any
 * failure (callers tolerate a missing name).
 */
static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	rbd_assert(!rbd_dev->spec->image_name);

	/* Build the length-prefixed image id that forms the request */
	len = strlen(rbd_dev->spec->image_id);
	image_id_size = sizeof (__le32) + len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = (char *) image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
				"rbd", "dir_get_name",
				image_id, image_id_size,
				(char *) reply_buf, size, NULL);
	if (ret < 0)
		goto out;
	p = reply_buf;
	end = (char *) reply_buf + size;
	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;	/* failure is tolerated */
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}
2625
2626/*
2627 * When a parent image gets probed, we only have the pool, image,
2628 * and snapshot ids but not the names of any of them. This call
2629 * is made later to fill in those names. It has to be done after
2630 * rbd_dev_snaps_update() has completed because some of the
2631 * information (in particular, snapshot name) is not available
2632 * until then.
2633 */
2634static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
2635{
2636 struct ceph_osd_client *osdc;
2637 const char *name;
2638 void *reply_buf = NULL;
2639 int ret;
2640
2641 if (rbd_dev->spec->pool_name)
2642 return 0; /* Already have the names */
2643
2644 /* Look up the pool name */
2645
2646 osdc = &rbd_dev->rbd_client->client->osdc;
2647 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05002648 if (!name) {
2649 rbd_warn(rbd_dev, "there is no pool with id %llu",
2650 rbd_dev->spec->pool_id); /* Really a BUG() */
2651 return -EIO;
2652 }
Alex Elder9e15b772012-10-30 19:40:33 -05002653
2654 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
2655 if (!rbd_dev->spec->pool_name)
2656 return -ENOMEM;
2657
2658 /* Fetch the image name; tolerate failure here */
2659
2660 name = rbd_dev_image_name(rbd_dev);
Alex Elder69e7a022012-11-01 08:39:26 -05002661 if (name)
Alex Elder9e15b772012-10-30 19:40:33 -05002662 rbd_dev->spec->image_name = (char *) name;
Alex Elder69e7a022012-11-01 08:39:26 -05002663 else
Alex Elder06ecc6c2012-11-01 10:17:15 -05002664 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05002665
2666 /* Look up the snapshot name. */
2667
2668 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
2669 if (!name) {
Alex Elder935dc892012-11-01 10:17:15 -05002670 rbd_warn(rbd_dev, "no snapshot with id %llu",
2671 rbd_dev->spec->snap_id); /* Really a BUG() */
Alex Elder9e15b772012-10-30 19:40:33 -05002672 ret = -EIO;
2673 goto out_err;
2674 }
2675 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
2676 if(!rbd_dev->spec->snap_name)
2677 goto out_err;
2678
2679 return 0;
2680out_err:
2681 kfree(reply_buf);
2682 kfree(rbd_dev->spec->pool_name);
2683 rbd_dev->spec->pool_name = NULL;
2684
2685 return ret;
2686}
2687
Alex Elder6e14b1a2012-07-03 16:01:19 -05002688static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05002689{
2690 size_t size;
2691 int ret;
2692 void *reply_buf;
2693 void *p;
2694 void *end;
2695 u64 seq;
2696 u32 snap_count;
2697 struct ceph_snap_context *snapc;
2698 u32 i;
2699
2700 /*
2701 * We'll need room for the seq value (maximum snapshot id),
2702 * snapshot count, and array of that many snapshot ids.
2703 * For now we have a fixed upper limit on the number we're
2704 * prepared to receive.
2705 */
2706 size = sizeof (__le64) + sizeof (__le32) +
2707 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2708 reply_buf = kzalloc(size, GFP_KERNEL);
2709 if (!reply_buf)
2710 return -ENOMEM;
2711
2712 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2713 "rbd", "get_snapcontext",
2714 NULL, 0,
Alex Elder07b23912012-11-09 08:43:16 -06002715 reply_buf, size, ver);
Alex Elder35d489f2012-07-03 16:01:19 -05002716 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2717 if (ret < 0)
2718 goto out;
2719
2720 ret = -ERANGE;
2721 p = reply_buf;
2722 end = (char *) reply_buf + size;
2723 ceph_decode_64_safe(&p, end, seq, out);
2724 ceph_decode_32_safe(&p, end, snap_count, out);
2725
2726 /*
2727 * Make sure the reported number of snapshot ids wouldn't go
2728 * beyond the end of our buffer. But before checking that,
2729 * make sure the computed size of the snapshot context we
2730 * allocate is representable in a size_t.
2731 */
2732 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2733 / sizeof (u64)) {
2734 ret = -EINVAL;
2735 goto out;
2736 }
2737 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2738 goto out;
2739
2740 size = sizeof (struct ceph_snap_context) +
2741 snap_count * sizeof (snapc->snaps[0]);
2742 snapc = kmalloc(size, GFP_KERNEL);
2743 if (!snapc) {
2744 ret = -ENOMEM;
2745 goto out;
2746 }
2747
2748 atomic_set(&snapc->nref, 1);
2749 snapc->seq = seq;
2750 snapc->num_snaps = snap_count;
2751 for (i = 0; i < snap_count; i++)
2752 snapc->snaps[i] = ceph_decode_64(&p);
2753
2754 rbd_dev->header.snapc = snapc;
2755
2756 dout(" snap context seq = %llu, snap_count = %u\n",
2757 (unsigned long long) seq, (unsigned int) snap_count);
2758
2759out:
2760 kfree(reply_buf);
2761
2762 return 0;
2763}
2764
/*
 * Fetch the name of snapshot slot "which" (an index into the snap
 * context) via the "get_snapshot_name" class method.  Returns a
 * newly-allocated string owned by the caller, or an ERR_PTR().
 */
static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	size_t size;
	void *reply_buf;
	__le64 snap_id;
	int ret;
	void *p;
	void *end;
	char *snap_name;

	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_snapshot_name",
				(char *) &snap_id, sizeof (snap_id),
				reply_buf, size, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	/* Copy the encoded string out of the (about to be freed) buffer */
	p = reply_buf;
	end = (char *) reply_buf + size;
	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out;
	} else {
		dout(" snap_id 0x%016llx snap_name = %s\n",
			(unsigned long long) le64_to_cpu(snap_id), snap_name);
	}
	kfree(reply_buf);

	return snap_name;
out:
	kfree(reply_buf);

	return ERR_PTR(ret);
}
2807
2808static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2809 u64 *snap_size, u64 *snap_features)
2810{
2811 __le64 snap_id;
2812 u8 order;
2813 int ret;
2814
2815 snap_id = rbd_dev->header.snapc->snaps[which];
2816 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2817 if (ret)
2818 return ERR_PTR(ret);
2819 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2820 if (ret)
2821 return ERR_PTR(ret);
2822
2823 return rbd_dev_v2_snap_name(rbd_dev, which);
2824}
2825
2826static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2827 u64 *snap_size, u64 *snap_features)
2828{
2829 if (rbd_dev->image_format == 1)
2830 return rbd_dev_v1_snap_info(rbd_dev, which,
2831 snap_size, snap_features);
2832 if (rbd_dev->image_format == 2)
2833 return rbd_dev_v2_snap_info(rbd_dev, which,
2834 snap_size, snap_features);
2835 return ERR_PTR(-EINVAL);
2836}
2837
Alex Elder117973f2012-08-31 17:29:55 -05002838static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2839{
2840 int ret;
2841 __u8 obj_order;
2842
2843 down_write(&rbd_dev->header_rwsem);
2844
2845 /* Grab old order first, to see if it changes */
2846
2847 obj_order = rbd_dev->header.obj_order,
2848 ret = rbd_dev_v2_image_size(rbd_dev);
2849 if (ret)
2850 goto out;
2851 if (rbd_dev->header.obj_order != obj_order) {
2852 ret = -EIO;
2853 goto out;
2854 }
2855 rbd_update_mapping_size(rbd_dev);
2856
2857 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2858 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2859 if (ret)
2860 goto out;
2861 ret = rbd_dev_snaps_update(rbd_dev);
2862 dout("rbd_dev_snaps_update returned %d\n", ret);
2863 if (ret)
2864 goto out;
2865 ret = rbd_dev_snaps_register(rbd_dev);
2866 dout("rbd_dev_snaps_register returned %d\n", ret);
2867out:
2868 up_write(&rbd_dev->header_rwsem);
2869
2870 return ret;
2871}
2872
/*
 * Scan the rbd device's current snapshot list and compare it to the
 * newly-received snapshot context.  Remove any existing snapshots
 * not present in the new snapshot context.  Add a new snapshot for
 * any snaphots in the snapshot context not in the current list.
 * And verify there are no changes to snapshots we already know
 * about.
 *
 * Assumes the snapshots in the snapshot context are sorted by
 * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
 * are also maintained in that order.)
 */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const u32 snap_count = snapc->num_snaps;
	struct list_head *head = &rbd_dev->snaps;
	struct list_head *links = head->next;
	u32 index = 0;

	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
	/*
	 * Walk the new context (by index) and the old list (by links)
	 * in parallel; both are sorted, so a classic merge works.
	 */
	while (index < snap_count || links != head) {
		u64 snap_id;
		struct rbd_snap *snap;
		char *snap_name;
		u64 snap_size = 0;
		u64 snap_features = 0;

		snap_id = index < snap_count ? snapc->snaps[index]
					     : CEPH_NOSNAP;
		snap = links != head ? list_entry(links, struct rbd_snap, node)
				     : NULL;
		rbd_assert(!snap || snap->id != CEPH_NOSNAP);

		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
			struct list_head *next = links->next;

			/* Existing snapshot not in the new snap context */

			/* If the mapped snapshot vanished, mark it gone */
			if (rbd_dev->spec->snap_id == snap->id)
				atomic_set(&rbd_dev->exists, 0);
			rbd_remove_snap_dev(snap);
			dout("%ssnap id %llu has been removed\n",
				rbd_dev->spec->snap_id == snap->id ?
							"mapped " : "",
				(unsigned long long) snap->id);

			/* Done with this list entry; advance */

			links = next;
			continue;
		}

		snap_name = rbd_dev_snap_info(rbd_dev, index,
					&snap_size, &snap_features);
		if (IS_ERR(snap_name))
			return PTR_ERR(snap_name);

		/* NOTE(review): prints snap_count, not index -- verify */
		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
			(unsigned long long) snap_id);
		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
			struct rbd_snap *new_snap;

			/* We haven't seen this snapshot before */

			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
					snap_id, snap_size, snap_features);
			if (IS_ERR(new_snap)) {
				int err = PTR_ERR(new_snap);

				dout("  failed to add dev, error %d\n", err);

				return err;
			}

			/* New goes before existing, or at end of list */

			dout("  added dev%s\n", snap ? "" : " at end\n");
			if (snap)
				list_add_tail(&new_snap->node, &snap->node);
			else
				list_add_tail(&new_snap->node, head);
		} else {
			/* Already have this one */

			dout("  already present\n");

			/* Known snapshots must never change */
			rbd_assert(snap->size == snap_size);
			rbd_assert(!strcmp(snap->name, snap_name));
			rbd_assert(snap->features == snap_features);

			/* Done with this list entry; advance */

			links = links->next;
		}

		/* Advance to the next entry in the snapshot context */

		index++;
	}
	dout("%s: done\n", __func__);

	return 0;
}
2977
/*
 * Scan the list of snapshots and register the devices for any that
 * have not already been registered.
 */
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
{
	struct rbd_snap *snap;
	int ret = 0;

	dout("%s called\n", __func__);
	/* The rbd device itself must be registered before its snapshots */
	if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
		return -EIO;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!rbd_snap_registered(snap)) {
			ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
			if (ret < 0)
				break;
		}
	}
	dout("%s: returning %d\n", __func__, ret);

	return ret;
}
3002
/*
 * Register the rbd device in sysfs under the rbd bus, named by its
 * numeric device id.  Registration is serialized under ctl_mutex
 * (taken nested, since a caller may already hold it).
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	struct device *dev;
	int ret;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	dev = &rbd_dev->dev;
	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);

	mutex_unlock(&ctl_mutex);

	return ret;
}
3022
/* Remove the rbd device from sysfs (undoes rbd_bus_add_dev()) */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
3027
/*
 * Establish the watch on the image's header object, refreshing the
 * header and retrying for as long as the watch request fails with
 * -ERANGE.  Returns the final watch-request result.
 */
static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
{
	int ret, rc;

	do {
		ret = rbd_req_sync_watch(rbd_dev, 1);
		if (ret == -ERANGE) {
			rc = rbd_dev_refresh(rbd_dev, NULL);
			if (rc < 0)
				return rc;
		}
	} while (ret == -ERANGE);

	return ret;
}
3043
/* Current maximum assigned device id (maintained by rbd_dev_id_put()) */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06003060
/*
 * Remove an rbd_dev from the global list, and record that its
 * identifier is no longer in use.
 */
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
{
	struct list_head *tmp;
	int rbd_id = rbd_dev->dev_id;
	int max_id;

	rbd_assert(rbd_id > 0);

	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);

	/*
	 * If the id being "put" is not the current maximum, there
	 * is nothing special we need to do.
	 */
	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
		spin_unlock(&rbd_dev_list_lock);
		return;
	}

	/*
	 * We need to update the current maximum id.  Search the
	 * list to find out what it is.  We're more likely to find
	 * the maximum at the end, so search the list backward.
	 */
	max_id = 0;
	list_for_each_prev(tmp, &rbd_dev_list) {
		struct rbd_device *rbd_dev;

		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id > max_id)
			max_id = rbd_dev->dev_id;
	}
	spin_unlock(&rbd_dev_list_lock);

	/*
	 * The max id could have been updated by rbd_dev_id_get(), in
	 * which case it now accurately reflects the new maximum.
	 * Be careful not to overwrite the maximum value in that
	 * case.
	 */
	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
	dout("  max dev id has been reset\n");
}
3111
/*
 * Advance *buf past any leading white space and return the length
 * of the token (run of non-white-space characters) that follows.
 * *buf must be '\0'-terminated; on return it points at the first
 * non-space character (or the terminating '\0').
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() returns nonzero in the
	 * "C" and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* skip leading white space */

	return strcspn(*buf, spaces);	/* length of the token found */
}
3130
/*
 * Find the next token in *buf and, when the caller's buffer is big
 * enough (length < token_size), copy it there; a copied result is
 * always terminated with '\0'.  *buf must be '\0'-terminated on
 * entry, and is advanced past the token even when the token was
 * too big to copy.
 *
 * Returns the token length (excluding the '\0').  A return of 0
 * means no token was found; a value >= token_size means the token
 * would not fit.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	/* Characters isspace() recognizes in the "C"/"POSIX" locales */
	const char *delims = " \f\n\r\t\v";
	size_t len;

	*buf += strspn(*buf, delims);	/* skip leading white space */
	len = strcspn(*buf, delims);	/* token length */
	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
3160
3161/*
Alex Elderea3352f2012-07-09 21:04:23 -05003162 * Finds the next token in *buf, dynamically allocates a buffer big
3163 * enough to hold a copy of it, and copies the token into the new
3164 * buffer. The copy is guaranteed to be terminated with '\0'. Note
3165 * that a duplicate buffer is created even for a zero-length token.
3166 *
3167 * Returns a pointer to the newly-allocated duplicate, or a null
3168 * pointer if memory for the duplicate was not available. If
3169 * the lenp argument is a non-null pointer, the length of the token
3170 * (not including the '\0') is returned in *lenp.
3171 *
3172 * If successful, the *buf pointer will be updated to point beyond
3173 * the end of the found token.
3174 *
3175 * Note: uses GFP_KERNEL for allocation.
3176 */
3177static inline char *dup_token(const char **buf, size_t *lenp)
3178{
3179 char *dup;
3180 size_t len;
3181
3182 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05003183 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05003184 if (!dup)
3185 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05003186 *(dup + len) = '\0';
3187 *buf += len;
3188
3189 if (lenp)
3190 *lenp = len;
3191
3192 return dup;
3193}
3194
3195/*
Alex Elder859c31d2012-10-25 23:34:42 -05003196 * Parse the options provided for an "rbd add" (i.e., rbd image
3197 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
3198 * and the data written is passed here via a NUL-terminated buffer.
3199 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05003200 *
Alex Elder859c31d2012-10-25 23:34:42 -05003201 * The information extracted from these options is recorded in
3202 * the other parameters which return dynamically-allocated
3203 * structures:
3204 * ceph_opts
3205 * The address of a pointer that will refer to a ceph options
3206 * structure. Caller must release the returned pointer using
3207 * ceph_destroy_options() when it is no longer needed.
3208 * rbd_opts
3209 * Address of an rbd options pointer. Fully initialized by
3210 * this function; caller must release with kfree().
3211 * spec
3212 * Address of an rbd image specification pointer. Fully
3213 * initialized by this function based on parsed options.
3214 * Caller must release with rbd_spec_put().
3215 *
3216 * The options passed take this form:
3217 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3218 * where:
3219 * <mon_addrs>
3220 * A comma-separated list of one or more monitor addresses.
3221 * A monitor address is an ip address, optionally followed
3222 * by a port number (separated by a colon).
3223 * I.e.: ip1[:port1][,ip2[:port2]...]
3224 * <options>
3225 * A comma-separated list of ceph and/or rbd options.
3226 * <pool_name>
3227 * The name of the rados pool containing the rbd image.
3228 * <image_name>
3229 * The name of the image in that pool to map.
3230 * <snap_id>
3231 * An optional snapshot id. If provided, the mapping will
3232 * present data from the image at the time that snapshot was
3233 * created. The image head is used if no snapshot id is
3234 * provided. Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06003235 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	size_t mon_addrs_size;
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	/*
	 * The monitor list is not NUL-terminated in place; record its
	 * start and length so ceph_parse_options() can be given an
	 * explicit end pointer (mon_addrs + mon_addrs_size - 1) below.
	 */
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	ret = -EINVAL;
	/* dup_token() gives a writable, NUL-terminated copy */
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	spec->image_name = dup_token(&buf, NULL);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	/* Copy len + 1 bytes, then force termination (byte at buf[len]
	 * may be a space rather than a NUL) */
	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!spec->snap_name)
		goto out_mem;
	*(spec->snap_name + len) = '\0';

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	/* Success: hand ownership of all three results to the caller */
	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	/* On failure nothing was stored through the out parameters */
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}
3338
Alex Elder589d30e2012-07-10 20:30:11 -05003339/*
3340 * An rbd format 2 image has a unique identifier, distinct from the
3341 * name given to it by the user. Internally, that identifier is
3342 * what's used to specify the names of objects related to the image.
3343 *
3344 * A special "rbd id" object is used to map an rbd image name to its
3345 * id. If that object doesn't exist, then there is no v2 rbd image
3346 * with the supplied name.
3347 *
3348 * This function will record the given rbd_dev's image_id field if
3349 * it can be determined, and in that case will return 0. If any
3350 * errors occur a negative errno will be returned and the rbd_dev's
3351 * image_id field will be unchanged (and should be NULL).
3352 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.
	 */
	if (rbd_dev->spec->image_id)
		return 0;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.  The id object
	 * name is "<RBD_ID_PREFIX><image_name>".
	 */
	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* Invoke the "get_id" method of the "rbd" object class */
	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	/* Decode the length-prefixed string into a fresh allocation;
	 * on success rbd_dev->spec owns the resulting image_id */
	p = response;
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						NULL, GFP_NOIO);
	if (IS_ERR(rbd_dev->spec->image_id)) {
		ret = PTR_ERR(rbd_dev->spec->image_id);
		rbd_dev->spec->image_id = NULL;	/* stays NULL on error */
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
3414
/*
 * Probe setup for a format 1 (original) rbd image: record an empty
 * image id, build the header object name ("<image_name><RBD_SUFFIX>"),
 * and read the on-disk header into rbd_dev->header.  Returns 0 on
 * success or a negative errno, in which case the fields populated
 * here are freed/reset again.
 */
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->spec->image_id)
		return -ENOMEM;

	/* Record the header object name for this rbd image. */

	size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	sprintf(rbd_dev->header_name, "%s%s",
		rbd_dev->spec->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;

	/* Version 1 images have no parent (no layering) */

	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;

	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	/* Undo the allocations made above; leave fields NULL */
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;

	return ret;
}
3463
/*
 * Probe setup for a format 2 image: build the header object name
 * from the (already-known) image id, then fetch the image metadata
 * (size/order, object prefix, features, optional parent info, and
 * the snapshot context) via the format 2 class methods.  Returns 0
 * on success or a negative errno, in which case everything set up
 * here is released again.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name ("<RBD_HEADER_PREFIX><image_id>") for this
	 * rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	/* Release everything acquired above, including any parent
	 * spec recorded by rbd_dev_v2_parent_info() */
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3536
/*
 * Complete device setup after the image format has been probed:
 * update snapshot and mapping state, assign a device id, register
 * the block device and sysfs entries, start the watch request, and
 * finally announce the disk.  Returns 0 on success or a negative
 * errno after unwinding whatever had been set up.
 */
static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_probe_update_spec(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	ret = register_blkdev(0, rbd_dev->name);	/* 0: dynamic major */
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	ret = rbd_init_watch_dev(rbd_dev);
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;
err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;
err_out_disk:
	/* Unwind in reverse order of the setup steps above */
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}
3618
Alex Eldera30b71b2012-07-10 20:30:11 -05003619/*
3620 * Probe for the existence of the header object for the given rbd
3621 * device. For format 2 images this includes determining the image
3622 * id.
3623 */
3624static int rbd_dev_probe(struct rbd_device *rbd_dev)
3625{
3626 int ret;
3627
3628 /*
3629 * Get the id from the image id object. If it's not a
3630 * format 2 image, we'll get ENOENT back, and we'll assume
3631 * it's a format 1 image.
3632 */
3633 ret = rbd_dev_image_id(rbd_dev);
3634 if (ret)
3635 ret = rbd_dev_v1_probe(rbd_dev);
3636 else
3637 ret = rbd_dev_v2_probe(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05003638 if (ret) {
Alex Eldera30b71b2012-07-10 20:30:11 -05003639 dout("probe failed, returning %d\n", ret);
3640
Alex Elder83a06262012-10-30 15:47:17 -05003641 return ret;
3642 }
3643
3644 ret = rbd_dev_probe_finish(rbd_dev);
3645 if (ret)
3646 rbd_header_free(&rbd_dev->header);
3647
Alex Eldera30b71b2012-07-10 20:30:11 -05003648 return ret;
3649}
3650
/*
 * sysfs "add" handler (write to /sys/bus/rbd/add): parse the
 * user-supplied mapping description, connect to the cluster, look
 * up the pool, create the rbd_device and probe/register it.
 * Returns count on success or a negative errno.
 *
 * Ownership of ceph_opts, rbdc and spec is handed off step by step
 * (each pointer is NULLed once something else owns it) so that the
 * error ladder at the bottom only releases what is still ours.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	/* hold a module reference for the lifetime of the mapping */
	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto err_out_module;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}
	ceph_opts = NULL;	/* rbd_dev client now owns this */

	/* pick the pool */
	osdc = &rbdc->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
	if (rc < 0)
		goto err_out_client;
	spec->pool_id = (u64) rc;

	/* The ceph file layout needs to fit pool id in 32 bits */

	if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
		rc = -EIO;
		goto err_out_client;
	}

	rbd_dev = rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		goto err_out_client;
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */

	rbd_dev->mapping.read_only = rbd_opts->read_only;
	kfree(rbd_opts);
	rbd_opts = NULL;	/* done with this */

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_rbd_dev;

	return count;
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_opts);
	rbd_spec_put(spec);
err_out_module:
	module_put(THIS_MODULE);

	dout("Error adding device %s\n", buf);

	return (ssize_t) rc;
}
3723
Alex Elderde71a292012-07-03 16:01:19 -05003724static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003725{
3726 struct list_head *tmp;
3727 struct rbd_device *rbd_dev;
3728
Alex Eldere124a822012-01-29 13:57:44 -06003729 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003730 list_for_each(tmp, &rbd_dev_list) {
3731 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05003732 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06003733 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003734 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06003735 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003736 }
Alex Eldere124a822012-01-29 13:57:44 -06003737 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003738 return NULL;
3739}
3740
/*
 * Device-model release callback: tear down a mapped rbd device.
 * Stops the watch machinery, releases the block device and header
 * state, returns the device id, destroys the rbd_device, and drops
 * the module reference taken in rbd_add().
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Cancel the lingering watch request, if one was set up */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_watch(rbd_dev, 0);	/* 0: tear down the watch */

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
3769
/*
 * sysfs "remove" handler (write to /sys/bus/rbd/remove): the buffer
 * holds the decimal device id of the mapping to remove.  Returns
 * count on success, -ENOENT if no such device, -EBUSY if it is
 * still open, or a parse error.
 */
static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	int ret = count;

	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	/* NOTE(review): open_count is read under ctl_mutex only;
	 * presumably the open path serializes against it — confirm */
	if (rbd_dev->open_count) {
		ret = -EBUSY;
		goto done;
	}

	/* Drop snapshots, then let the sysfs/device core release the
	 * device (ends up in rbd_dev_release()) */
	rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);

done:
	mutex_unlock(&ctl_mutex);

	return ret;
}
3809
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003810/*
3811 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003812 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003813 */
3814static int rbd_sysfs_init(void)
3815{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003816 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003817
Alex Elderfed4c142012-02-07 12:03:36 -06003818 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06003819 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003820 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003821
Alex Elderfed4c142012-02-07 12:03:36 -06003822 ret = bus_register(&rbd_bus_type);
3823 if (ret < 0)
3824 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003825
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003826 return ret;
3827}
3828
/* Tear down the sysfs state set up by rbd_sysfs_init(), in reverse order. */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3834
3835int __init rbd_init(void)
3836{
3837 int rc;
3838
3839 rc = rbd_sysfs_init();
3840 if (rc)
3841 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06003842 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003843 return 0;
3844}
3845
/* Module exit: remove the sysfs bus and root device entries. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3850
/* Module entry/exit points and module metadata */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");