blob: 4daa400c13aa6fafbb09a9a78bfac7262506bded [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* It might be useful to have this defined elsewhere too */

#define	U64_MAX	((u64) (~0ULL))

#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

/* Snapshot sysfs device names are "snap_<name>"; bound the name length */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
#define RBD_MAX_OPT_LEN		1024

/* Mapping the image head (no snapshot) is represented by this name */
#define RBD_SNAP_HEAD_NAME	"-"

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */

#define RBD_FEATURE_LAYERING      1

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_ALL          (0)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

#define RBD_READ_ONLY_DEFAULT		false
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070097
Yehuda Sadeh602adf42010-08-12 16:11:25 -070098/*
99 * block device image metadata (in-memory version)
100 */
struct rbd_image_header {
	/* These fields never change for a given rbd image */
	char *object_prefix;	/* prefix used to name data objects */
	u64 features;		/* feature bit mask (0 for v1 images) */
	__u8 obj_order;		/* object size is 1 << obj_order bytes */
	__u8 crypt_type;	/* on-disk encryption type */
	__u8 comp_type;		/* on-disk compression type */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* current image size, in bytes */
	struct ceph_snap_context *snapc;	/* snapshot id context */
	char *snap_names;	/* NUL-separated snapshot name buffer */
	u64 *snap_sizes;	/* per-snapshot image sizes */

	u64 obj_version;	/* header object version last read */
};
117
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500118/*
119 * An rbd image specification.
120 *
121 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
122 * identify an image.
123 */
struct rbd_spec {
	u64 pool_id;		/* id of the pool holding the image */
	char *pool_name;	/* name of that pool */

	char *image_id;		/* image id (v2 images) */
	size_t image_id_len;
	char *image_name;	/* user-visible image name */
	size_t image_name_len;

	u64 snap_id;		/* CEPH_NOSNAP when mapping the head */
	char *snap_name;	/* RBD_SNAP_HEAD_NAME for the head */

	struct kref kref;	/* specs are shared and reference counted */
};
138
/* Options parsed from the rbd "add" string (see rbd_opts_tokens) */
struct rbd_options {
	bool	read_only;	/* map the device read-only */
};
142
143/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600144 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700145 */
struct rbd_client {
	struct ceph_client	*client;	/* the underlying ceph client */
	struct kref		kref;		/* shared by devices; refcounted */
	struct list_head	node;		/* entry in rbd_client_list */
};
151
152/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600153 * a request completion status
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700154 */
struct rbd_req_status {
	int done;		/* nonzero once the request has completed */
	int rc;			/* completion result code */
	u64 bytes;		/* bytes transferred by the request */
};
160
161/*
162 * a collection of requests
163 */
164struct rbd_req_coll {
165 int total;
166 int num_done;
167 struct kref kref;
168 struct rbd_req_status status[0];
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700169};
170
Alex Elderf0f8cef2012-01-29 13:57:44 -0600171/*
172 * a single io request
173 */
/*
 * a single io request
 */
struct rbd_request {
	struct request		*rq;		/* blk layer request */
	struct bio		*bio;		/* cloned bio */
	struct page		**pages;	/* list of used pages */
	u64			len;		/* length of this request */
	int			coll_index;	/* slot in coll->status[] */
	struct rbd_req_coll	*coll;		/* owning collection, if any */
};
182
/* In-memory record of one image snapshot, exposed via sysfs */
struct rbd_snap {
	struct device		dev;		/* sysfs device */
	const char		*name;		/* snapshot name */
	u64			size;		/* image size for this snapshot */
	struct list_head	node;		/* entry in rbd_dev->snaps */
	u64			id;		/* snapshot id */
	u64			features;	/* feature bits for this snapshot */
};
191
/* State of the image (or one of its snapshots) as currently mapped */
struct rbd_mapping {
	u64                     size;		/* mapped size, in bytes */
	u64                     features;	/* feature bits in effect */
	bool			read_only;	/* device mapped read-only */
};
197
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700198/*
199 * a single device
200 */
/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;	/* shared ceph client */

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue lock */

	struct rbd_image_header	header;		/* image metadata */
	bool                    exists;		/* mapping established */
	struct rbd_spec		*spec;		/* pool/image/snap identity */

	char			*header_name;	/* name of the header object */

	/* osd watch on the header object, for update notifications */
	struct ceph_osd_event   *watch_event;
	struct ceph_osd_request *watch_request;

	struct rbd_spec		*parent_spec;	/* layering: parent identity */
	u64			parent_overlap;	/* bytes overlapping parent */

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;	/* size/features/ro as mapped */

	struct list_head	node;		/* entry in rbd_dev_list */

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by ctl_mutex */
};
240
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);	/* protects rbd_dev_list */

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);	/* protects rbd_client_list */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700248
Alex Elder304f6802012-08-31 17:29:52 -0500249static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
250static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
251
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800252static void rbd_dev_release(struct device *dev);
Alex Elder41f38c22012-10-25 23:34:40 -0500253static void rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800254
Alex Elderf0f8cef2012-01-29 13:57:44 -0600255static ssize_t rbd_add(struct bus_type *bus, const char *buf,
256 size_t count);
257static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
258 size_t count);
259
/* Bus-level attributes: writing "add"/"remove" maps/unmaps devices */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};
270
/*
 * Empty release callback for the root device; the device is static,
 * but the driver core warns if a device has no release method.
 */
static void rbd_root_dev_release(struct device *dev)
{
}
274
/* Parent device for all rbd devices registered in sysfs */
static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};
279
#ifdef RBD_DEBUG
/*
 * rbd_assert() - verify an invariant; on failure report the failed
 * expression with function and line, then BUG().  Compiles to a no-op
 * when RBD_DEBUG is not defined.
 */
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800292
/* Take a reference on the rbd device's embedded sysfs device */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}
297
/* Drop the reference taken by rbd_get_dev() */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700302
Alex Elder117973f2012-08-31 17:29:55 -0500303static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
304static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700305
/*
 * Block device open callback.  Refuses writable opens of a read-only
 * mapping; otherwise takes a device reference and counts the open
 * (open_count is protected by ctl_mutex).
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	rbd_get_dev(rbd_dev);
	set_device_ro(bdev, rbd_dev->mapping.read_only);
	rbd_dev->open_count++;
	mutex_unlock(&ctl_mutex);

	return 0;
}
321
/*
 * Block device release callback: undo rbd_open() -- decrement the
 * open count (must be positive) and drop the device reference.
 */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	rbd_assert(rbd_dev->open_count > 0);
	rbd_dev->open_count--;
	rbd_put_dev(rbd_dev);
	mutex_unlock(&ctl_mutex);

	return 0;
}
334
/* Block-layer operations for rbd disks */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
340
341/*
342 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500343 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700344 */
/*
 * Initialize an rbd client instance.
 * We own *ceph_opts: on success ownership passes to the ceph client;
 * on failure they are destroyed here (unless already consumed).
 * Returns the new client or an ERR_PTR on failure.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("rbd_client_create\n");
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
	if (IS_ERR(rbdc->client))
		goto out_mutex;	/* NOTE(review): reports -ENOMEM, not
				 * PTR_ERR(rbdc->client) -- confirm intended */
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_err;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	mutex_unlock(&ctl_mutex);

	dout("rbd_client_create created %p\n", rbdc);
	return rbdc;

out_err:
	ceph_destroy_client(rbdc->client);
out_mutex:
	mutex_unlock(&ctl_mutex);
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	return ERR_PTR(ret);
}
388
389/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700390 * Find a ceph client with specific addr and configuration. If
391 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700392 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700393static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700394{
395 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700396 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700397
Alex Elder43ae4702012-07-03 16:01:18 -0500398 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700399 return NULL;
400
Alex Elder1f7ba332012-08-10 13:12:07 -0700401 spin_lock(&rbd_client_list_lock);
402 list_for_each_entry(client_node, &rbd_client_list, node) {
403 if (!ceph_compare_options(ceph_opts, client_node->client)) {
404 kref_get(&client_node->kref);
405 found = true;
406 break;
407 }
408 }
409 spin_unlock(&rbd_client_list_lock);
410
411 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700412}
413
414/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700415 * mount options
416 */
/*
 * Mount option token values.  Tokens are grouped by argument type;
 * the Opt_last_* sentinels let parse_rbd_opts_token() classify a
 * token by comparing against the group boundaries.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};
427
/* Option strings recognized in the rbd "add" string */
static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};
438
/*
 * Parse a single mount option (callback for ceph_parse_options()).
 * @c: the option text; @private: the struct rbd_options to fill in.
 * Returns 0 on success or a negative errno on a bad option.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* Classify the token by which sentinel range it falls into */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* match_token() returned something we don't handle */
		rbd_assert(false);
		break;
	}
	return 0;
}
479
480/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700481 * Get a ceph client with specific addr and configuration, if one does
482 * not exist create it.
483 */
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Consumes ceph_opts either way: they are
 * destroyed when reusing a client, or handed to the new one.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (rbdc) {
		/* Reusing an existing client; the options are not needed */
		ceph_destroy_options(ceph_opts);
		return rbdc;
	}

	return rbd_client_create(ceph_opts);
}
496
497/*
498 * Destroy ceph client
Alex Elderd23a4b32012-01-29 13:57:43 -0600499 *
Alex Elder432b8582012-01-29 13:57:44 -0600500 * Caller must hold rbd_client_list_lock.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700501 */
/*
 * Destroy ceph client (kref release callback).  Unlinks the client
 * from rbd_client_list, then tears down the ceph client and frees
 * the wrapper.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
514
515/*
516 * Drop reference to ceph client node. If it's not referenced anymore, release
517 * it.
518 */
/*
 * Drop reference to ceph client node.  If it's not referenced anymore,
 * release it.  A NULL rbdc is tolerated.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}
524
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700525/*
526 * Destroy requests collection
527 */
/*
 * Destroy requests collection (kref release callback).
 */
static void rbd_coll_release(struct kref *kref)
{
	struct rbd_req_coll *coll =
		container_of(kref, struct rbd_req_coll, kref);

	dout("rbd_coll_release %p\n", coll);
	kfree(coll);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700536
Alex Eldera30b71b2012-07-10 20:30:11 -0500537static bool rbd_image_format_valid(u32 image_format)
538{
539 return image_format == 1 || image_format == 2;
540}
541
/*
 * Sanity-check an on-disk (format 1) image header before trusting any
 * of its fields.  Returns false if the header is malformed.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire the snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
580
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700581/*
582 * Create a new header structure, translate header format from the on-disk
583 * header.
584 */
585static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500586 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700587{
Alex Elderccece232012-07-10 20:30:10 -0500588 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500589 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500590 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500591 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700592
Alex Elder6a523252012-07-19 17:12:59 -0500593 memset(header, 0, sizeof (*header));
594
Alex Elder103a1502012-08-02 11:29:45 -0500595 snap_count = le32_to_cpu(ondisk->snap_count);
596
Alex Elder58c17b02012-08-23 23:22:06 -0500597 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
598 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500599 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700600 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500601 memcpy(header->object_prefix, ondisk->object_prefix, len);
602 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600603
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700604 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500605 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
606
Alex Elder621901d2012-08-23 23:22:06 -0500607 /* Save a copy of the snapshot names */
608
Alex Elderf785cc12012-08-23 23:22:06 -0500609 if (snap_names_len > (u64) SIZE_MAX)
610 return -EIO;
611 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700612 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500613 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500614 /*
615 * Note that rbd_dev_v1_header_read() guarantees
616 * the ondisk buffer we're working with has
617 * snap_names_len bytes beyond the end of the
618 * snapshot id array, this memcpy() is safe.
619 */
620 memcpy(header->snap_names, &ondisk->snaps[snap_count],
621 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500622
Alex Elder621901d2012-08-23 23:22:06 -0500623 /* Record each snapshot's size */
624
Alex Elderd2bb24e2012-07-26 23:37:14 -0500625 size = snap_count * sizeof (*header->snap_sizes);
626 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700627 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500628 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500629 for (i = 0; i < snap_count; i++)
630 header->snap_sizes[i] =
631 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700632 } else {
Alex Elderccece232012-07-10 20:30:10 -0500633 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700634 header->snap_names = NULL;
635 header->snap_sizes = NULL;
636 }
Alex Elder849b4262012-07-09 21:04:24 -0500637
Alex Elder34b13182012-07-13 20:35:12 -0500638 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700639 header->obj_order = ondisk->options.order;
640 header->crypt_type = ondisk->options.crypt_type;
641 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500642
Alex Elder621901d2012-08-23 23:22:06 -0500643 /* Allocate and fill in the snapshot context */
644
Alex Elderf84344f2012-08-31 17:29:51 -0500645 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500646 size = sizeof (struct ceph_snap_context);
647 size += snap_count * sizeof (header->snapc->snaps[0]);
648 header->snapc = kzalloc(size, GFP_KERNEL);
649 if (!header->snapc)
650 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700651
652 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500653 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700654 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500655 for (i = 0; i < snap_count; i++)
656 header->snapc->snaps[i] =
657 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700658
659 return 0;
660
Alex Elder6a523252012-07-19 17:12:59 -0500661out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500662 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500663 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700664 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500665 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500666 kfree(header->object_prefix);
667 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500668
Alex Elder00f1f362012-02-07 12:03:36 -0600669 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700670}
671
Alex Elder9e15b772012-10-30 19:40:33 -0500672static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
673{
674 struct rbd_snap *snap;
675
676 if (snap_id == CEPH_NOSNAP)
677 return RBD_SNAP_HEAD_NAME;
678
679 list_for_each_entry(snap, &rbd_dev->snaps, node)
680 if (snap_id == snap->id)
681 return snap->name;
682
683 return NULL;
684}
685
Alex Elder8836b992012-08-30 14:42:15 -0500686static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700687{
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700688
Alex Eldere86924a2012-07-10 20:30:11 -0500689 struct rbd_snap *snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600690
Alex Eldere86924a2012-07-10 20:30:11 -0500691 list_for_each_entry(snap, &rbd_dev->snaps, node) {
692 if (!strcmp(snap_name, snap->name)) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500693 rbd_dev->spec->snap_id = snap->id;
Alex Eldere86924a2012-07-10 20:30:11 -0500694 rbd_dev->mapping.size = snap->size;
Alex Elder34b13182012-07-13 20:35:12 -0500695 rbd_dev->mapping.features = snap->features;
Alex Elder00f1f362012-02-07 12:03:36 -0600696
Alex Eldere86924a2012-07-10 20:30:11 -0500697 return 0;
Alex Elder00f1f362012-02-07 12:03:36 -0600698 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700699 }
Alex Eldere86924a2012-07-10 20:30:11 -0500700
Alex Elder00f1f362012-02-07 12:03:36 -0600701 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700702}
703
/*
 * Fill in the device's mapping (snap_id, size, features) from the
 * header, for either the image head or a named snapshot.  Snapshot
 * mappings are forced read-only.  Marks the device as existing on
 * success.
 */
static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
{
	int ret;

	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		rbd_dev->spec->snap_id = CEPH_NOSNAP;
		rbd_dev->mapping.size = rbd_dev->header.image_size;
		rbd_dev->mapping.features = rbd_dev->header.features;
		ret = 0;
	} else {
		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
		if (ret < 0)
			goto done;
		rbd_dev->mapping.read_only = true;
	}
	rbd_dev->exists = true;
done:
	return ret;
}
724
/*
 * Free everything allocated by rbd_header_from_disk(), nulling the
 * pointers so a double free is harmless.
 */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	ceph_put_snap_context(header->snapc);
	header->snapc = NULL;
}
736
/*
 * Build the object name ("<prefix>.<segment>") for the segment
 * containing the given image offset.  Returns a kmalloc'd string the
 * caller must kfree, or NULL on allocation/format failure.
 */
static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
{
	char *name;
	u64 segment;
	int ret;

	/* GFP_NOIO: may be called on the I/O path */
	name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
	if (!name)
		return NULL;
	segment = offset >> rbd_dev->header.obj_order;
	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
			rbd_dev->header.object_prefix, segment);
	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
		pr_err("error formatting segment name for #%llu (%d)\n",
			segment, ret);
		kfree(name);
		name = NULL;
	}

	return name;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700758
Alex Elder65ccfe22012-08-09 10:33:26 -0700759static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
760{
761 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700762
Alex Elder65ccfe22012-08-09 10:33:26 -0700763 return offset & (segment_size - 1);
764}
765
766static u64 rbd_segment_length(struct rbd_device *rbd_dev,
767 u64 offset, u64 length)
768{
769 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
770
771 offset &= segment_size - 1;
772
Alex Elderaafb2302012-09-06 16:00:54 -0500773 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700774 if (offset + length > segment_size)
775 length = segment_size - offset;
776
777 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700778}
779
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700780static int rbd_get_num_segments(struct rbd_image_header *header,
781 u64 ofs, u64 len)
782{
Alex Elderdf111be2012-08-09 10:33:26 -0700783 u64 start_seg;
784 u64 end_seg;
785
786 if (!len)
787 return 0;
788 if (len - 1 > U64_MAX - ofs)
789 return -ERANGE;
790
791 start_seg = ofs >> header->obj_order;
792 end_seg = (ofs + len - 1) >> header->obj_order;
793
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700794 return end_seg - start_seg + 1;
795}
796
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700797/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700798 * returns the size of an object in the image
799 */
800static u64 rbd_obj_bytes(struct rbd_image_header *header)
801{
802 return 1 << header->obj_order;
803}
804
805/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700806 * bio helpers
807 */
808
809static void bio_chain_put(struct bio *chain)
810{
811 struct bio *tmp;
812
813 while (chain) {
814 tmp = chain;
815 chain = chain->bi_next;
816 bio_put(tmp);
817 }
818}
819
/*
 * Zero the data in a bio chain from byte offset start_ofs (measured
 * from the start of the chain) through the end of the chain.  Used
 * by the read completion path to clear data a short or failed read
 * did not fill in.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;		/* segment index for bio_for_each_segment() */
	int pos = 0;	/* running byte position within the whole chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* Zero from start_ofs within this segment,
				 * or from its beginning if the whole
				 * segment lies beyond start_ofs. */
				int remainder = max(start_ofs - pos, 0);
				/* Temporarily map the segment's page so
				 * it can be written. */
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
846
/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 *
 * The clone shares the source's pages (BIO_CLONED) and describes
 * exactly the bytes [offset, offset + len) of bio_src.  Returns
 * NULL on bad arguments or allocation failure.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;
	unsigned short end_idx;
	unsigned short vcnt;
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	/* resid is now the byte offset into segment idx */
	voff = resid;

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	/* resid is now the number of bytes used in segment end_idx */
	vcnt = end_idx - idx + 1;	/* segments spanned by the range */

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		/* Single segment: its length is just the clone length. */
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700927
/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					 unsigned int *offset,
					 unsigned int len,
					 gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi)
			goto out_err;	/* EINVAL; ran out of bio's */
		/* Clone no more than the remainder of this source bio. */
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		/* Append the clone at the tail of the result chain. */
		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {
			/* Source bio exhausted; move to the next one. */
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	/* Report where the first un-cloned byte lives. */
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	/* Release any clones built before the failure. */
	bio_chain_put(chain);

	return NULL;
}
988
989/*
990 * helpers for osd request op vectors.
991 */
Alex Elder57cfc102012-06-26 12:57:03 -0700992static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
993 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700994{
Alex Elder57cfc102012-06-26 12:57:03 -0700995 struct ceph_osd_req_op *ops;
996
997 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
998 if (!ops)
999 return NULL;
1000
1001 ops[0].op = opcode;
1002
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001003 /*
1004 * op extent offset and length will be set later on
1005 * in calc_raw_layout()
1006 */
Alex Elder57cfc102012-06-26 12:57:03 -07001007 ops[0].payload_len = payload_len;
1008
1009 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001010}
1011
/*
 * Free an op vector allocated by rbd_create_rw_ops().
 * kfree(NULL) is a no-op, so a NULL ops pointer is safe here.
 */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
1016
/*
 * Record completion status for entry 'index' of a request collection
 * and complete, in order, any contiguous prefix of the collection
 * that is now fully done.  The block request 'rq' is ended piecewise,
 * one collection entry at a time, under the queue lock.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	if (!coll) {
		/* Not part of a collection: complete the request directly. */
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	max = min = coll->num_done;
	/* Extend max past every already-completed entry in order. */
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i<max; i++) {
		/* __blk_end_request: queue lock is held here. */
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		/* Each completed entry drops one reference on coll. */
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
1054
1055static void rbd_coll_end_req(struct rbd_request *req,
1056 int ret, u64 len)
1057{
1058 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
1059}
1060
/*
 * Allocate, build, and submit a ceph osd request.
 *
 * The data for the request may be supplied either as a bio chain
 * (bio) or as a page vector (pages/num_pages).  If rbd_cb is
 * non-NULL the request completes asynchronously through that
 * callback; otherwise this function waits for completion and drops
 * the request itself.  If linger_req is non-NULL the request is
 * registered with the osd client as a lingering request and returned
 * through *linger_req.
 *
 * The rbd_request bookkeeping struct is owned by the completion
 * callback on success; on the error paths here it is freed locally.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *ops,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *req,
					 struct ceph_msg *msg),
			  struct ceph_osd_request **linger_req,
			  u64 *ver)
{
	struct ceph_osd_request *req;
	struct ceph_file_layout *layout;
	int ret;
	u64 bno;
	struct timespec mtime = CURRENT_TIME;
	struct rbd_request *req_data;
	struct ceph_osd_request_head *reqhead;
	struct ceph_osd_client *osdc;

	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
	if (!req_data) {
		/* Fail the collection entry so the block request ends. */
		if (coll)
			rbd_coll_end_req_index(rq, coll, coll_index,
					       -ENOMEM, len);
		return -ENOMEM;
	}

	if (coll) {
		req_data->coll = coll;
		req_data->coll_index = coll_index;
	}

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
		object_name, (unsigned long long) ofs,
		(unsigned long long) len, coll, coll_index);

	osdc = &rbd_dev->rbd_client->client->osdc;
	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
					false, GFP_NOIO, pages, bio);
	if (!req) {
		ret = -ENOMEM;
		goto done_pages;
	}

	req->r_callback = rbd_cb;

	req_data->rq = rq;
	req_data->bio = bio;
	req_data->pages = pages;
	req_data->len = len;

	req->r_priv = req_data;

	reqhead = req->r_request->front.iov_base;
	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);

	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
	req->r_oid_len = strlen(req->r_oid);

	/* Single-object layout: one stripe covering the whole object. */
	layout = &req->r_file_layout;
	memset(layout, 0, sizeof(*layout));
	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_stripe_count = cpu_to_le32(1);
	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id);
	ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
				   req, ops);
	rbd_assert(ret == 0);

	ceph_osdc_build_request(req, ofs, &len,
				ops,
				snapc,
				&mtime,
				req->r_oid, req->r_oid_len);

	if (linger_req) {
		ceph_osdc_set_request_linger(osdc, req);
		*linger_req = req;
	}

	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* Synchronous: wait here and drop our request reference. */
		ret = ceph_osdc_wait_request(osdc, req);
		if (ver)
			*ver = le64_to_cpu(req->r_reassert_version.version);
		dout("reassert_ver=%llu\n",
			(unsigned long long)
				le64_to_cpu(req->r_reassert_version.version));
		ceph_osdc_put_request(req);
	}
	return ret;

done_err:
	bio_chain_put(req_data->bio);
	ceph_osdc_put_request(req);
done_pages:
	rbd_coll_end_req(req_data, ret, len);
	kfree(req_data);
	return ret;
}
1174
/*
 * Ceph osd op completion callback (asynchronous requests).
 *
 * Parses the reply, fixes up reads by zero-filling the bio chain
 * (a missing object reads as all zeroes; a short read is padded),
 * completes the request's collection entry, then releases the osd
 * request and its rbd_request bookkeeping.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);	/* first op follows the header */
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
		(unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* Object doesn't exist: the data reads as all zeroes. */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* Short read: zero the tail and report the full length. */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1214
/*
 * Completion callback for requests whose reply needs no processing
 * (e.g. notify acks); just drop the request reference.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1219
/*
 * Do a synchronous ceph osd operation.
 *
 * A temporary page vector sized for inbound_size bytes at offset ofs
 * is allocated and attached to the request.  For read-type
 * operations (CEPH_OSD_FLAG_READ) with a non-NULL inbound buffer,
 * the returned data is copied out of the pages into inbound; the
 * rbd_do_request() result is used as the copy length.  Returns a
 * negative errno on failure.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int flags,
			   struct ceph_osd_req_op *ops,
			   const char *object_name,
			   u64 ofs, u64 inbound_size,
			   char *inbound,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(ops != NULL);

	num_pages = calc_pages_for(ofs, inbound_size);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* No callback and no collection: rbd_do_request() waits. */
	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			  object_name, ofs, inbound_size, NULL,
			  pages, num_pages,
			  flags,
			  ops,
			  NULL, 0,
			  NULL,
			  linger_req, ver);
	if (ret < 0)
		goto done;

	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1263
/*
 * Do an asynchronous ceph osd operation: map one piece of a block
 * request onto the rbd object containing it and issue a single read
 * or write.  Completion is reported via rbd_req_cb() into the given
 * request collection.
 *
 * The caller has already split the I/O so it does not cross an
 * object boundary (asserted below via seg_len == len).
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;
	int opcode;
	int flags;
	u64 snapid;

	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	if (rq_data_dir(rq) == WRITE) {
		/* Writes go to the head (no snapshot) with a snap context. */
		opcode = CEPH_OSD_OP_WRITE;
		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
		snapid = CEPH_NOSNAP;
		payload_len = seg_len;
	} else {
		/* Reads target the mapped snapshot; no snap context. */
		opcode = CEPH_OSD_OP_READ;
		flags = CEPH_OSD_FLAG_READ;
		snapc = NULL;
		snapid = rbd_dev->spec->snap_id;
		payload_len = 0;
	}

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1328
1329/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001330 * Request sync osd read
1331 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001332static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001333 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001334 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001335 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001336 char *buf,
1337 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001338{
Alex Elder913d2fd2012-06-26 12:57:03 -07001339 struct ceph_osd_req_op *ops;
1340 int ret;
1341
1342 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1343 if (!ops)
1344 return -ENOMEM;
1345
1346 ret = rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001347 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001348 CEPH_OSD_FLAG_READ,
Alex Elder913d2fd2012-06-26 12:57:03 -07001349 ops, object_name, ofs, len, buf, NULL, ver);
1350 rbd_destroy_ops(ops);
1351
1352 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001353}
1354
1355/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001356 * Request sync osd watch
1357 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001358static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001359 u64 ver,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001360 u64 notify_id)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001361{
1362 struct ceph_osd_req_op *ops;
Sage Weil11f77002011-05-12 16:13:54 -07001363 int ret;
1364
Alex Elder57cfc102012-06-26 12:57:03 -07001365 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1366 if (!ops)
1367 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001368
Josh Durgina71b8912011-12-05 18:10:44 -08001369 ops[0].watch.ver = cpu_to_le64(ver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001370 ops[0].watch.cookie = notify_id;
1371 ops[0].watch.flag = 0;
1372
Alex Elder0ce1a792012-07-03 16:01:18 -05001373 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001374 rbd_dev->header_name, 0, 0, NULL,
Alex Elderad4f2322012-07-03 16:01:19 -05001375 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001376 CEPH_OSD_FLAG_READ,
1377 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001378 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001379 rbd_simple_req_cb, 0, NULL);
1380
1381 rbd_destroy_ops(ops);
1382 return ret;
1383}
1384
/*
 * Callback invoked by the osd client when a watch notification
 * arrives for the header object: refresh our view of the header,
 * then acknowledge the notification.
 *
 * NOTE(review): if rbd_dev_refresh() can fail without writing *hver,
 * the ack below would pass an uninitialized version -- confirm
 * rbd_dev_refresh() always sets it.
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	u64 hver;
	int rc;

	if (!rbd_dev)
		return;

	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
		rbd_dev->header_name, (unsigned long long) notify_id,
		(unsigned int) opcode);
	rc = rbd_dev_refresh(rbd_dev, &hver);
	if (rc)
		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
			   " update snaps: %d\n", rbd_dev->major, rc);

	/* Ack even if the refresh failed, so the osd stops resending. */
	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
}
1404
/*
 * Register a watch on the image header object so rbd_watch_cb() is
 * invoked when the header changes.  An osd event is created first;
 * its cookie ties the event to the synchronous WATCH op (flag = 1)
 * sent to the header object.  The lingering request is kept in
 * rbd_dev->watch_request.
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 = establish the watch */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	/* Undo the event registration if the WATCH op failed. */
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1448
/*
 * Tear down the watch on the image header object: send a WATCH op
 * with the flag cleared (unwatch) using the original event cookie,
 * then cancel and clear the osd event regardless of the op result.
 */
static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 0;	/* 0 = remove the watch */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL, NULL, NULL);


	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	return ret;
}
1478
/*
 * Synchronous osd object method call ("class method" execution).
 *
 * Executes class_name.method_name on the named object.  The
 * outbound buffer (outbound_size bytes) carries the method's input
 * parameters; any reply payload is copied into inbound (up to
 * inbound_size bytes).  Returns the operation result or a negative
 * errno.
 *
 * NOTE(review): class_len/method_len are stored in __u8 fields, so
 * names longer than 255 bytes would be silently truncated; callers
 * pass short fixed names, but worth confirming.
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     int flags,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int class_name_len = strlen(class_name);
	int method_name_len = strlen(method_name);
	int payload_size;
	int ret;

	/*
	 * Any input parameters required by the method we're calling
	 * will be sent along with the class and method names as
	 * part of the message payload.  That data and its size are
	 * supplied via the indata and indata_len fields (named from
	 * the perspective of the server side) in the OSD request
	 * operation.
	 */
	payload_size = class_name_len + method_name_len + outbound_size;
	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
	if (!ops)
		return -ENOMEM;

	ops[0].cls.class_name = class_name;
	ops[0].cls.class_len = (__u8) class_name_len;
	ops[0].cls.method_name = method_name;
	ops[0].cls.method_len = (__u8) method_name_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = outbound;
	ops[0].cls.indata_len = outbound_size;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      flags, ops,
			      object_name, 0, inbound_size, inbound,
			      NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1531
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001532static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1533{
1534 struct rbd_req_coll *coll =
1535 kzalloc(sizeof(struct rbd_req_coll) +
1536 sizeof(struct rbd_req_status) * num_reqs,
1537 GFP_ATOMIC);
1538
1539 if (!coll)
1540 return NULL;
1541 coll->total = num_reqs;
1542 kref_init(&coll->kref);
1543 return coll;
1544}
1545
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001546/*
1547 * block device queue callback
1548 */
/*
 * Request-queue callback: drain pending block requests, splitting each
 * into per-object segments and submitting them as osd requests.
 * Called by the block layer with q->queue_lock held and irqs disabled;
 * the lock is dropped around the (sleeping/allocating) submission work
 * and re-taken before fetching the next request.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;

	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		bool do_write;
		unsigned int size;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;
		unsigned int bio_offset;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);
		if (do_write && rbd_dev->mapping.read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		/* Drop the queue lock: what follows can block/allocate */
		spin_unlock_irq(q->queue_lock);

		down_read(&rbd_dev->header_rwsem);

		/* A mapped snapshot may have been removed underneath us */
		if (!rbd_dev->exists) {
			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
			up_read(&rbd_dev->header_rwsem);
			/* NOTE(review): dout message lacks a '\n' */
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		/* Pin the snapshot context for the life of this request */
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		bio = rq->bio;

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		if (num_segs <= 0) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, num_segs);
			ceph_put_snap_context(snapc);
			continue;
		}
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		/* One osd request per object segment the I/O spans */
		bio_offset = 0;
		do {
			u64 limit = rbd_segment_length(rbd_dev, ofs, size);
			unsigned int chain_size;
			struct bio *bio_chain;

			BUG_ON(limit > (u64) UINT_MAX);
			chain_size = (unsigned int) limit;
			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);

			kref_get(&coll->kref);

			/* Pass a cloned bio chain via an osd request */

			bio_chain = bio_chain_clone_range(&bio,
						&bio_offset, chain_size,
						GFP_ATOMIC);
			if (bio_chain)
				(void) rbd_do_op(rq, rbd_dev, snapc,
						ofs, chain_size,
						bio_chain, coll, cur_seg);
			else
				/* Clone failed: fail just this segment */
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, chain_size);
			size -= chain_size;
			ofs += chain_size;

			cur_seg++;
		} while (size > 0);
		/* Drop the allocator's reference taken in rbd_alloc_coll() */
		kref_put(&coll->kref, rbd_coll_release);

		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1655
1656/*
1657 * a queue callback. Makes sure that we don't create a bio that spans across
1658 * multiple osd objects. One exception would be with a single page bios,
Alex Elderf7760da2012-10-20 22:17:27 -05001659 * which we handle later at bio_chain_clone_range()
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001660 */
/*
 * merge_bvec_fn callback.  Returns how many bytes of @bvec may be
 * added to the bio described by @bmd without making it cross an rbd
 * object boundary.
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	sector_t sector_offset;
	sector_t sectors_per_obj;
	sector_t obj_sector_offset;
	int ret;

	/*
	 * Find how far into its rbd object the partition-relative
	 * bio start sector is to offset relative to the enclosing
	 * device.
	 */
	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
	/* obj_order is log2 of the object size in bytes */
	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	obj_sector_offset = sector_offset & (sectors_per_obj - 1);

	/*
	 * Compute the number of bytes from that offset to the end
	 * of the object.  Account for what's already used by the bio.
	 */
	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
	if (ret > bmd->bi_size)
		ret -= bmd->bi_size;
	else
		ret = 0;

	/*
	 * Don't send back more than was asked for.  And if the bio
	 * was empty, let the whole thing through because:  "Note
	 * that a block device *must* allow a single page to be
	 * added to an empty bio."
	 */
	rbd_assert(bvec->bv_len <= PAGE_SIZE);
	if (ret > (int) bvec->bv_len || !bmd->bi_size)
		ret = (int) bvec->bv_len;

	return ret;
}
1701
1702static void rbd_free_disk(struct rbd_device *rbd_dev)
1703{
1704 struct gendisk *disk = rbd_dev->disk;
1705
1706 if (!disk)
1707 return;
1708
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001709 if (disk->flags & GENHD_FL_UP)
1710 del_gendisk(disk);
1711 if (disk->queue)
1712 blk_cleanup_queue(disk->queue);
1713 put_disk(disk);
1714}
1715
1716/*
Alex Elder4156d992012-08-02 11:29:46 -05001717 * Read the complete header for the given rbd device.
1718 *
1719 * Returns a pointer to a dynamically-allocated buffer containing
1720 * the complete and validated header. Caller can pass the address
1721 * of a variable that will be filled in with the version of the
1722 * header object at the time it was read.
1723 *
1724 * Returns a pointer-coded errno if a failure occurs.
1725 */
1726static struct rbd_image_header_ondisk *
1727rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
1728{
1729 struct rbd_image_header_ondisk *ondisk = NULL;
1730 u32 snap_count = 0;
1731 u64 names_size = 0;
1732 u32 want_count;
1733 int ret;
1734
1735 /*
1736 * The complete header will include an array of its 64-bit
1737 * snapshot ids, followed by the names of those snapshots as
1738 * a contiguous block of NUL-terminated strings. Note that
1739 * the number of snapshots could change by the time we read
1740 * it in, in which case we re-read it.
1741 */
1742 do {
1743 size_t size;
1744
1745 kfree(ondisk);
1746
1747 size = sizeof (*ondisk);
1748 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1749 size += names_size;
1750 ondisk = kmalloc(size, GFP_KERNEL);
1751 if (!ondisk)
1752 return ERR_PTR(-ENOMEM);
1753
1754 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
1755 rbd_dev->header_name,
1756 0, size,
1757 (char *) ondisk, version);
1758
1759 if (ret < 0)
1760 goto out_err;
1761 if (WARN_ON((size_t) ret < size)) {
1762 ret = -ENXIO;
1763 pr_warning("short header read for image %s"
1764 " (want %zd got %d)\n",
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001765 rbd_dev->spec->image_name, size, ret);
Alex Elder4156d992012-08-02 11:29:46 -05001766 goto out_err;
1767 }
1768 if (!rbd_dev_ondisk_valid(ondisk)) {
1769 ret = -ENXIO;
1770 pr_warning("invalid header for image %s\n",
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001771 rbd_dev->spec->image_name);
Alex Elder4156d992012-08-02 11:29:46 -05001772 goto out_err;
1773 }
1774
1775 names_size = le64_to_cpu(ondisk->snap_names_len);
1776 want_count = snap_count;
1777 snap_count = le32_to_cpu(ondisk->snap_count);
1778 } while (snap_count != want_count);
1779
1780 return ondisk;
1781
1782out_err:
1783 kfree(ondisk);
1784
1785 return ERR_PTR(ret);
1786}
1787
1788/*
1789 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001790 */
1791static int rbd_read_header(struct rbd_device *rbd_dev,
1792 struct rbd_image_header *header)
1793{
Alex Elder4156d992012-08-02 11:29:46 -05001794 struct rbd_image_header_ondisk *ondisk;
1795 u64 ver = 0;
1796 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001797
Alex Elder4156d992012-08-02 11:29:46 -05001798 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1799 if (IS_ERR(ondisk))
1800 return PTR_ERR(ondisk);
1801 ret = rbd_header_from_disk(header, ondisk);
1802 if (ret >= 0)
1803 header->obj_version = ver;
1804 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001805
Alex Elder4156d992012-08-02 11:29:46 -05001806 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001807}
1808
Alex Elder41f38c22012-10-25 23:34:40 -05001809static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001810{
1811 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001812 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001813
Alex Eldera0593292012-07-19 09:09:27 -05001814 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder41f38c22012-10-25 23:34:40 -05001815 rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001816}
1817
Alex Elder94785542012-10-09 13:50:17 -07001818static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1819{
1820 sector_t size;
1821
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001822 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Alex Elder94785542012-10-09 13:50:17 -07001823 return;
1824
1825 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1826 dout("setting size to %llu sectors", (unsigned long long) size);
1827 rbd_dev->mapping.size = (u64) size;
1828 set_capacity(rbd_dev->disk, size);
1829}
1830
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001831/*
1832 * only read the first part of the ondisk header, without the snaps info
1833 */
Alex Elder117973f2012-08-31 17:29:55 -05001834static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001835{
1836 int ret;
1837 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001838
1839 ret = rbd_read_header(rbd_dev, &h);
1840 if (ret < 0)
1841 return ret;
1842
Josh Durgina51aa0c2011-12-05 10:35:04 -08001843 down_write(&rbd_dev->header_rwsem);
1844
Alex Elder94785542012-10-09 13:50:17 -07001845 /* Update image size, and check for resize of mapped image */
1846 rbd_dev->header.image_size = h.image_size;
1847 rbd_update_mapping_size(rbd_dev);
Sage Weil9db4b3e2011-04-19 22:49:06 -07001848
Alex Elder849b4262012-07-09 21:04:24 -05001849 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001850 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05001851 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08001852 /* osd requests may still refer to snapc */
1853 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001854
Alex Elderb8136232012-07-25 09:32:41 -05001855 if (hver)
1856 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08001857 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08001858 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001859 rbd_dev->header.snapc = h.snapc;
1860 rbd_dev->header.snap_names = h.snap_names;
1861 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05001862 /* Free the extra copy of the object prefix */
1863 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1864 kfree(h.object_prefix);
1865
Alex Elder304f6802012-08-31 17:29:52 -05001866 ret = rbd_dev_snaps_update(rbd_dev);
1867 if (!ret)
1868 ret = rbd_dev_snaps_register(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001869
Josh Durginc6666012011-11-21 17:11:12 -08001870 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001871
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001872 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001873}
1874
Alex Elder117973f2012-08-31 17:29:55 -05001875static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
Alex Elder1fe5e992012-07-25 09:32:41 -05001876{
1877 int ret;
1878
Alex Elder117973f2012-08-31 17:29:55 -05001879 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder1fe5e992012-07-25 09:32:41 -05001880 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05001881 if (rbd_dev->image_format == 1)
1882 ret = rbd_dev_v1_refresh(rbd_dev, hver);
1883 else
1884 ret = rbd_dev_v2_refresh(rbd_dev, hver);
Alex Elder1fe5e992012-07-25 09:32:41 -05001885 mutex_unlock(&ctl_mutex);
1886
1887 return ret;
1888}
1889
/*
 * Allocate and configure the gendisk and request queue for @rbd_dev.
 * The disk is stored in rbd_dev->disk but not yet added to the system
 * (no add_disk() here).  Returns 0 or -ENOMEM.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* Keep bios from spanning object boundaries */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
1938
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001939/*
1940 sysfs
1941*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001942
/* Map a sysfs struct device back to its containing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1947
/* sysfs "size": mapped device size in bytes. */
static ssize_t rbd_size_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	sector_t size;

	/* Capacity is kept in 512-byte sectors; read under header lock */
	down_read(&rbd_dev->header_rwsem);
	size = get_capacity(rbd_dev->disk);
	up_read(&rbd_dev->header_rwsem);

	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
}
1960
Alex Elder34b13182012-07-13 20:35:12 -05001961/*
1962 * Note this shows the features for whatever's mapped, which is not
1963 * necessarily the base image.
1964 */
/* sysfs "features": feature bits of the mapped image/snapshot. */
static ssize_t rbd_features_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "0x%016llx\n",
			(unsigned long long) rbd_dev->mapping.features);
}
1973
/* sysfs "major": block device major number. */
static ssize_t rbd_major_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%d\n", rbd_dev->major);
}
1981
/* sysfs "client_id": ceph client instance id ("client<N>"). */
static ssize_t rbd_client_id_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "client%lld\n",
			ceph_client_id(rbd_dev->rbd_client->client));
}
1990
/* sysfs "pool": name of the rados pool holding the image. */
static ssize_t rbd_pool_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
}
1998
/* sysfs "pool_id": numeric id of the rados pool. */
static ssize_t rbd_pool_id_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n",
		(unsigned long long) rbd_dev->spec->pool_id);
}
2007
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002008static ssize_t rbd_name_show(struct device *dev,
2009 struct device_attribute *attr, char *buf)
2010{
Alex Elder593a9e72012-02-07 12:03:37 -06002011 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002012
Alex Eldera92ffdf2012-10-30 19:40:33 -05002013 if (rbd_dev->spec->image_name)
2014 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2015
2016 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002017}
2018
/* sysfs "image_id": the image's unique id string. */
static ssize_t rbd_image_id_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
}
2026
Alex Elder34b13182012-07-13 20:35:12 -05002027/*
2028 * Shows the name of the currently-mapped snapshot (or
2029 * RBD_SNAP_HEAD_NAME for the base image).
2030 */
/* sysfs "current_snap": name of the mapped snapshot (or HEAD name). */
static ssize_t rbd_snap_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
}
2039
Alex Elder86b00e02012-10-25 23:34:42 -05002040/*
2041 * For an rbd v2 image, shows the pool id, image id, and snapshot id
2042 * for the parent image. If there is no parent, simply shows
2043 * "(no parent image)".
2044 */
static ssize_t rbd_parent_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	struct rbd_spec *spec = rbd_dev->parent_spec;
	int count;
	char *bufp = buf;

	/* Layered (format 2) images only; others have no parent spec */
	if (!spec)
		return sprintf(buf, "(no parent image)\n");

	/* Emit one "key value" line per field, advancing bufp */
	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
			(unsigned long long) spec->pool_id, spec->pool_name);
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
			spec->image_name ? spec->image_name : "(unknown)");
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
			(unsigned long long) spec->snap_id, spec->snap_name);
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
	if (count < 0)
		return count;
	bufp += count;

	/* Total number of bytes written into buf */
	return (ssize_t) (bufp - buf);
}
2082
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002083static ssize_t rbd_image_refresh(struct device *dev,
2084 struct device_attribute *attr,
2085 const char *buf,
2086 size_t size)
2087{
Alex Elder593a9e72012-02-07 12:03:37 -06002088 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05002089 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002090
Alex Elder117973f2012-08-31 17:29:55 -05002091 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05002092
2093 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002094}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002095
/* Per-device sysfs attributes (all read-only except "refresh") */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* Device lifetime is managed elsewhere; nothing to free here */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
2141
2142
2143/*
2144 sysfs - snapshots
2145*/
2146
/* Per-snapshot sysfs "snap_size": snapshot image size in bytes. */
static ssize_t rbd_snap_size_show(struct device *dev,
				  struct device_attribute *attr,
				  char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
}
2155
/* Per-snapshot sysfs "snap_id": the snapshot's numeric id. */
static ssize_t rbd_snap_id_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
}
2164
/* Per-snapshot sysfs "snap_features": feature bits of the snapshot. */
static ssize_t rbd_snap_features_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "0x%016llx\n",
			(unsigned long long) snap->features);
}
2174
/* Per-snapshot sysfs attributes (all read-only) */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* Called by the driver core when the last reference is dropped */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2206
/* Take a reference on @spec and return it (for call chaining). */
static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
{
	kref_get(&spec->kref);

	return spec;
}
2213
static void rbd_spec_free(struct kref *kref);
/* Drop a reference on @spec; NULL is allowed and is a no-op. */
static void rbd_spec_put(struct rbd_spec *spec)
{
	if (spec)
		kref_put(&spec->kref, rbd_spec_free);
}
2220
2221static struct rbd_spec *rbd_spec_alloc(void)
2222{
2223 struct rbd_spec *spec;
2224
2225 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2226 if (!spec)
2227 return NULL;
2228 kref_init(&spec->kref);
2229
2230 rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */
2231
2232 return spec;
2233}
2234
/* kref release callback: free the spec and all its owned strings. */
static void rbd_spec_free(struct kref *kref)
{
	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);

	kfree(spec->pool_name);
	kfree(spec->image_id);
	kfree(spec->image_name);
	kfree(spec->snap_name);
	kfree(spec);
}
2245
Alex Elderc53d5892012-10-25 23:34:42 -05002246struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2247 struct rbd_spec *spec)
2248{
2249 struct rbd_device *rbd_dev;
2250
2251 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2252 if (!rbd_dev)
2253 return NULL;
2254
2255 spin_lock_init(&rbd_dev->lock);
2256 INIT_LIST_HEAD(&rbd_dev->node);
2257 INIT_LIST_HEAD(&rbd_dev->snaps);
2258 init_rwsem(&rbd_dev->header_rwsem);
2259
2260 rbd_dev->spec = spec;
2261 rbd_dev->rbd_client = rbdc;
2262
2263 return rbd_dev;
2264}
2265
/* Release everything an rbd_device owns, then the device itself. */
static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	rbd_spec_put(rbd_dev->parent_spec);
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev);
}
2274
/*
 * Whether @snap's device has been registered with the driver core.
 * The device type is only set at registration time, so it must agree
 * with device_is_registered(); the assert checks that invariant.
 */
static bool rbd_snap_registered(struct rbd_snap *snap)
{
	bool ret = snap->dev.type == &rbd_snap_device_type;
	bool reg = device_is_registered(&snap->dev);

	rbd_assert(!ret ^ reg);

	return ret;
}
2284
/*
 * Unlink @snap from its device's snapshot list and unregister its
 * sysfs device if it was registered (freeing happens in the device
 * release callback).
 */
static void rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	if (device_is_registered(&snap->dev))
		device_unregister(&snap->dev);
}
2291
Alex Elder14e70852012-07-19 09:09:27 -05002292static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002293 struct device *parent)
2294{
2295 struct device *dev = &snap->dev;
2296 int ret;
2297
2298 dev->type = &rbd_snap_device_type;
2299 dev->parent = parent;
2300 dev->release = rbd_snap_dev_release;
Alex Elderd4b125e2012-07-03 16:01:19 -05002301 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
Alex Elder304f6802012-08-31 17:29:52 -05002302 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2303
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002304 ret = device_register(dev);
2305
2306 return ret;
2307}
2308
/*
 * Allocate and fill in an rbd_snap with the given name, id, size and
 * features (the snapshot name is duplicated).  The snapshot is not
 * yet linked into any list or registered in sysfs.  Returns the new
 * snapshot, or an ERR_PTR-encoded -ENOMEM on allocation failure.
 */
static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
					   const char *snap_name,
					   u64 snap_id, u64 snap_size,
					   u64 snap_features)
{
	struct rbd_snap *snap;
	int ret;

	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
	if (!snap)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	snap->name = kstrdup(snap_name, GFP_KERNEL);
	if (!snap->name)
		goto err;

	snap->id = snap_id;
	snap->size = snap_size;
	snap->features = snap_features;

	return snap;

err:
	/* snap->name is NULL here; kfree(NULL) is a no-op */
	kfree(snap->name);
	kfree(snap);

	return ERR_PTR(ret);
}
2338
Alex Eldercd892122012-07-03 16:01:19 -05002339static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2340 u64 *snap_size, u64 *snap_features)
2341{
2342 char *snap_name;
2343
2344 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2345
2346 *snap_size = rbd_dev->header.snap_sizes[which];
2347 *snap_features = 0; /* No features for v1 */
2348
2349 /* Skip over names until we find the one we are looking for */
2350
2351 snap_name = rbd_dev->header.snap_names;
2352 while (which--)
2353 snap_name += strlen(snap_name) + 1;
2354
2355 return snap_name;
2356}
2357
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	/* Wire format of the "get_size" reply: order byte + le64 size */
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	/* Invoke the "get_size" class method on the image header object */
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Convert the reply to host byte order for the caller */
	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2391
/*
 * Fetch the size and object order of the base (unsnapshotted) image
 * and record them in the in-core header.
 */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
2398
Alex Elder1e130192012-07-03 16:01:19 -05002399static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2400{
2401 void *reply_buf;
2402 int ret;
2403 void *p;
2404
2405 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
2406 if (!reply_buf)
2407 return -ENOMEM;
2408
2409 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2410 "rbd", "get_object_prefix",
2411 NULL, 0,
2412 reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
2413 CEPH_OSD_FLAG_READ, NULL);
2414 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2415 if (ret < 0)
2416 goto out;
Alex Eldera0ea3a42012-10-10 21:19:13 -07002417 ret = 0; /* rbd_req_sync_exec() can return positive */
Alex Elder1e130192012-07-03 16:01:19 -05002418
2419 p = reply_buf;
2420 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
2421 p + RBD_OBJ_PREFIX_LEN_MAX,
2422 NULL, GFP_NOIO);
2423
2424 if (IS_ERR(rbd_dev->header.object_prefix)) {
2425 ret = PTR_ERR(rbd_dev->header.object_prefix);
2426 rbd_dev->header.object_prefix = NULL;
2427 } else {
2428 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
2429 }
2430
2431out:
2432 kfree(reply_buf);
2433
2434 return ret;
2435}
2436
/*
 * Get the feature bits for an image snapshot, or if snap_id is
 * CEPH_NOSNAP, for the base image.  Returns -ENXIO if the image
 * reports incompatible feature bits outside RBD_FEATURES_ALL.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	/* Wire format of the "get_features" reply */
	struct {
		__le64 features;
		__le64 incompat;
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Refuse the image if it requires features we don't know about */
	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_ALL)
		return -ENXIO;

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2470
/*
 * Fetch the feature bits for the base (unsnapshotted) image and
 * record them in the in-core header.
 */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}
2476
/*
 * Query the layering parent of this image.  On success, fills in
 * rbd_dev->parent_spec and rbd_dev->parent_overlap.  A pool id of
 * CEPH_NOPOOL in the reply means the image has no parent, which is
 * treated as success.
 */
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	size_t size;
	void *reply_buf = NULL;
	__le64 snapid;
	void *p;
	void *end;
	char *image_id;
	u64 overlap;
	size_t len = 0;
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	/* Size the reply buffer for the largest possible encoding */
	size = sizeof (__le64) +			/* pool_id */
		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
		sizeof (__le64) +			/* snap_id */
		sizeof (__le64);			/* overlap */
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf) {
		ret = -ENOMEM;
		goto out_err;
	}

	snapid = cpu_to_le64(CEPH_NOSNAP);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_parent",
				(char *) &snapid, sizeof (snapid),
				(char *) reply_buf, size,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out_err;

	/* Decode in wire order: pool_id, image_id, snap_id, overlap */
	ret = -ERANGE;
	p = reply_buf;
	end = (char *) reply_buf + size;
	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
	if (parent_spec->pool_id == CEPH_NOPOOL)
		goto out;	/* No parent?  No problem. */

	image_id = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_id)) {
		ret = PTR_ERR(image_id);
		goto out_err;
	}
	parent_spec->image_id = image_id;
	parent_spec->image_id_len = len;
	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
	ceph_decode_64_safe(&p, end, overlap, out_err);

	rbd_dev->parent_overlap = overlap;
	rbd_dev->parent_spec = parent_spec;
	parent_spec = NULL;	/* rbd_dev now owns this */
out:
	ret = 0;
out_err:
	kfree(reply_buf);
	rbd_spec_put(parent_spec);

	return ret;
}
2542
/*
 * Look up this image's name given its id, by asking the rbd
 * directory object.  Returns a dynamically-allocated name (caller
 * must free) or NULL on any failure -- callers treat a missing name
 * as non-fatal.
 */
static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	/* Only called when the image name is not yet known */
	rbd_assert(!rbd_dev->spec->image_name);

	/* Encode the image id as a ceph string (le32 length + bytes) */
	image_id_size = sizeof (__le32) + rbd_dev->spec->image_id_len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = (char *) image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id,
				(u32) rbd_dev->spec->image_id_len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	/* Ask the directory object to map the id to a name */
	ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
				"rbd", "dir_get_name",
				image_id, image_id_size,
				(char *) reply_buf, size,
				CEPH_OSD_FLAG_READ, NULL);
	if (ret < 0)
		goto out;
	p = reply_buf;
	end = (char *) reply_buf + size;
	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;	/* failure tolerated; return NULL */
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}
2592
2593/*
2594 * When a parent image gets probed, we only have the pool, image,
2595 * and snapshot ids but not the names of any of them. This call
2596 * is made later to fill in those names. It has to be done after
2597 * rbd_dev_snaps_update() has completed because some of the
2598 * information (in particular, snapshot name) is not available
2599 * until then.
2600 */
2601static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
2602{
2603 struct ceph_osd_client *osdc;
2604 const char *name;
2605 void *reply_buf = NULL;
2606 int ret;
2607
2608 if (rbd_dev->spec->pool_name)
2609 return 0; /* Already have the names */
2610
2611 /* Look up the pool name */
2612
2613 osdc = &rbd_dev->rbd_client->client->osdc;
2614 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
2615 if (!name)
2616 return -EIO; /* pool id too large (>= 2^31) */
2617
2618 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
2619 if (!rbd_dev->spec->pool_name)
2620 return -ENOMEM;
2621
2622 /* Fetch the image name; tolerate failure here */
2623
2624 name = rbd_dev_image_name(rbd_dev);
2625 if (name) {
2626 rbd_dev->spec->image_name_len = strlen(name);
2627 rbd_dev->spec->image_name = (char *) name;
2628 } else {
2629 pr_warning(RBD_DRV_NAME "%d "
2630 "unable to get image name for image id %s\n",
2631 rbd_dev->major, rbd_dev->spec->image_id);
2632 }
2633
2634 /* Look up the snapshot name. */
2635
2636 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
2637 if (!name) {
2638 ret = -EIO;
2639 goto out_err;
2640 }
2641 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
2642 if(!rbd_dev->spec->snap_name)
2643 goto out_err;
2644
2645 return 0;
2646out_err:
2647 kfree(reply_buf);
2648 kfree(rbd_dev->spec->pool_name);
2649 rbd_dev->spec->pool_name = NULL;
2650
2651 return ret;
2652}
2653
Alex Elder6e14b1a2012-07-03 16:01:19 -05002654static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05002655{
2656 size_t size;
2657 int ret;
2658 void *reply_buf;
2659 void *p;
2660 void *end;
2661 u64 seq;
2662 u32 snap_count;
2663 struct ceph_snap_context *snapc;
2664 u32 i;
2665
2666 /*
2667 * We'll need room for the seq value (maximum snapshot id),
2668 * snapshot count, and array of that many snapshot ids.
2669 * For now we have a fixed upper limit on the number we're
2670 * prepared to receive.
2671 */
2672 size = sizeof (__le64) + sizeof (__le32) +
2673 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2674 reply_buf = kzalloc(size, GFP_KERNEL);
2675 if (!reply_buf)
2676 return -ENOMEM;
2677
2678 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2679 "rbd", "get_snapcontext",
2680 NULL, 0,
2681 reply_buf, size,
Alex Elder6e14b1a2012-07-03 16:01:19 -05002682 CEPH_OSD_FLAG_READ, ver);
Alex Elder35d489f2012-07-03 16:01:19 -05002683 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2684 if (ret < 0)
2685 goto out;
2686
2687 ret = -ERANGE;
2688 p = reply_buf;
2689 end = (char *) reply_buf + size;
2690 ceph_decode_64_safe(&p, end, seq, out);
2691 ceph_decode_32_safe(&p, end, snap_count, out);
2692
2693 /*
2694 * Make sure the reported number of snapshot ids wouldn't go
2695 * beyond the end of our buffer. But before checking that,
2696 * make sure the computed size of the snapshot context we
2697 * allocate is representable in a size_t.
2698 */
2699 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2700 / sizeof (u64)) {
2701 ret = -EINVAL;
2702 goto out;
2703 }
2704 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2705 goto out;
2706
2707 size = sizeof (struct ceph_snap_context) +
2708 snap_count * sizeof (snapc->snaps[0]);
2709 snapc = kmalloc(size, GFP_KERNEL);
2710 if (!snapc) {
2711 ret = -ENOMEM;
2712 goto out;
2713 }
2714
2715 atomic_set(&snapc->nref, 1);
2716 snapc->seq = seq;
2717 snapc->num_snaps = snap_count;
2718 for (i = 0; i < snap_count; i++)
2719 snapc->snaps[i] = ceph_decode_64(&p);
2720
2721 rbd_dev->header.snapc = snapc;
2722
2723 dout(" snap context seq = %llu, snap_count = %u\n",
2724 (unsigned long long) seq, (unsigned int) snap_count);
2725
2726out:
2727 kfree(reply_buf);
2728
2729 return 0;
2730}
2731
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002732static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
2733{
2734 size_t size;
2735 void *reply_buf;
2736 __le64 snap_id;
2737 int ret;
2738 void *p;
2739 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002740 char *snap_name;
2741
2742 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
2743 reply_buf = kmalloc(size, GFP_KERNEL);
2744 if (!reply_buf)
2745 return ERR_PTR(-ENOMEM);
2746
2747 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
2748 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2749 "rbd", "get_snapshot_name",
2750 (char *) &snap_id, sizeof (snap_id),
2751 reply_buf, size,
2752 CEPH_OSD_FLAG_READ, NULL);
2753 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2754 if (ret < 0)
2755 goto out;
2756
2757 p = reply_buf;
2758 end = (char *) reply_buf + size;
Alex Eldere5c35532012-10-25 23:34:41 -05002759 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002760 if (IS_ERR(snap_name)) {
2761 ret = PTR_ERR(snap_name);
2762 goto out;
2763 } else {
2764 dout(" snap_id 0x%016llx snap_name = %s\n",
2765 (unsigned long long) le64_to_cpu(snap_id), snap_name);
2766 }
2767 kfree(reply_buf);
2768
2769 return snap_name;
2770out:
2771 kfree(reply_buf);
2772
2773 return ERR_PTR(ret);
2774}
2775
2776static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2777 u64 *snap_size, u64 *snap_features)
2778{
2779 __le64 snap_id;
2780 u8 order;
2781 int ret;
2782
2783 snap_id = rbd_dev->header.snapc->snaps[which];
2784 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2785 if (ret)
2786 return ERR_PTR(ret);
2787 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2788 if (ret)
2789 return ERR_PTR(ret);
2790
2791 return rbd_dev_v2_snap_name(rbd_dev, which);
2792}
2793
2794static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2795 u64 *snap_size, u64 *snap_features)
2796{
2797 if (rbd_dev->image_format == 1)
2798 return rbd_dev_v1_snap_info(rbd_dev, which,
2799 snap_size, snap_features);
2800 if (rbd_dev->image_format == 2)
2801 return rbd_dev_v2_snap_info(rbd_dev, which,
2802 snap_size, snap_features);
2803 return ERR_PTR(-EINVAL);
2804}
2805
Alex Elder117973f2012-08-31 17:29:55 -05002806static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2807{
2808 int ret;
2809 __u8 obj_order;
2810
2811 down_write(&rbd_dev->header_rwsem);
2812
2813 /* Grab old order first, to see if it changes */
2814
2815 obj_order = rbd_dev->header.obj_order,
2816 ret = rbd_dev_v2_image_size(rbd_dev);
2817 if (ret)
2818 goto out;
2819 if (rbd_dev->header.obj_order != obj_order) {
2820 ret = -EIO;
2821 goto out;
2822 }
2823 rbd_update_mapping_size(rbd_dev);
2824
2825 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2826 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2827 if (ret)
2828 goto out;
2829 ret = rbd_dev_snaps_update(rbd_dev);
2830 dout("rbd_dev_snaps_update returned %d\n", ret);
2831 if (ret)
2832 goto out;
2833 ret = rbd_dev_snaps_register(rbd_dev);
2834 dout("rbd_dev_snaps_register returned %d\n", ret);
2835out:
2836 up_write(&rbd_dev->header_rwsem);
2837
2838 return ret;
2839}
2840
/*
 * Scan the rbd device's current snapshot list and compare it to the
 * newly-received snapshot context.  Remove any existing snapshots
 * not present in the new snapshot context.  Add a new snapshot for
 * any snaphots in the snapshot context not in the current list.
 * And verify there are no changes to snapshots we already know
 * about.
 *
 * Assumes the snapshots in the snapshot context are sorted by
 * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
 * are also maintained in that order.)
 */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const u32 snap_count = snapc->num_snaps;
	struct list_head *head = &rbd_dev->snaps;
	struct list_head *links = head->next;
	u32 index = 0;

	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
	/* Merge walk: index over the new context, links over the old list */
	while (index < snap_count || links != head) {
		u64 snap_id;
		struct rbd_snap *snap;
		char *snap_name;
		u64 snap_size = 0;
		u64 snap_features = 0;

		snap_id = index < snap_count ? snapc->snaps[index]
					     : CEPH_NOSNAP;
		snap = links != head ? list_entry(links, struct rbd_snap, node)
				     : NULL;
		rbd_assert(!snap || snap->id != CEPH_NOSNAP);

		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
			struct list_head *next = links->next;

			/* Existing snapshot not in the new snap context */

			/* If it was the mapped snapshot, the mapping is gone */
			if (rbd_dev->spec->snap_id == snap->id)
				rbd_dev->exists = false;
			rbd_remove_snap_dev(snap);
			/*
			 * NOTE(review): snap->id is read after
			 * rbd_remove_snap_dev(snap) -- confirm the rbd_snap
			 * lifetime guarantees this is not a use-after-free.
			 */
			dout("%ssnap id %llu has been removed\n",
				rbd_dev->spec->snap_id == snap->id ?
							"mapped " : "",
				(unsigned long long) snap->id);

			/* Done with this list entry; advance */

			links = next;
			continue;
		}

		snap_name = rbd_dev_snap_info(rbd_dev, index,
					&snap_size, &snap_features);
		if (IS_ERR(snap_name))
			return PTR_ERR(snap_name);

		/*
		 * NOTE(review): this prints snap_count, not index -- looks
		 * like it was meant to print the entry index; confirm.
		 */
		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
			(unsigned long long) snap_id);
		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
			struct rbd_snap *new_snap;

			/* We haven't seen this snapshot before */

			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
					snap_id, snap_size, snap_features);
			if (IS_ERR(new_snap)) {
				int err = PTR_ERR(new_snap);

				dout(" failed to add dev, error %d\n", err);

				return err;
			}

			/* New goes before existing, or at end of list */

			dout(" added dev%s\n", snap ? "" : " at end\n");
			if (snap)
				list_add_tail(&new_snap->node, &snap->node);
			else
				list_add_tail(&new_snap->node, head);
		} else {
			/* Already have this one */

			dout(" already present\n");

			/* A known snapshot must not have changed */
			rbd_assert(snap->size == snap_size);
			rbd_assert(!strcmp(snap->name, snap_name));
			rbd_assert(snap->features == snap_features);

			/* Done with this list entry; advance */

			links = links->next;
		}

		/* Advance to the next entry in the snapshot context */

		index++;
	}
	dout("%s: done\n", __func__);

	return 0;
}
2945
/*
 * Scan the list of snapshots and register the devices for any that
 * have not already been registered.
 */
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
{
	struct rbd_snap *snap;
	int ret = 0;

	dout("%s called\n", __func__);
	/* The parent rbd device must already be registered in sysfs */
	if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
		return -EIO;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!rbd_snap_registered(snap)) {
			ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
			if (ret < 0)
				break;	/* stop at the first failure */
		}
	}
	dout("%s: returning %d\n", __func__, ret);

	return ret;
}
2970
/*
 * Register the rbd device on the rbd bus so it appears in sysfs.
 * The sysfs name is simply the device's numeric id.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	struct device *dev;
	int ret;

	/*
	 * NOTE(review): SINGLE_DEPTH_NESTING suggests a caller can hold
	 * a lock of ctl_mutex's class already -- confirm against callers.
	 */
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	dev = &rbd_dev->dev;
	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);

	mutex_unlock(&ctl_mutex);

	return ret;
}
2990
/*
 * Remove the rbd device from sysfs; final cleanup happens through
 * the dev->release callback set in rbd_bus_add_dev().
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2995
/*
 * Set up a watch on the image header object, refreshing the device
 * and retrying as long as the watch request fails with -ERANGE.
 * NOTE(review): -ERANGE presumably indicates a stale header version
 * that the refresh resolves -- confirm against rbd_req_sync_watch().
 */
static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
{
	int ret, rc;

	do {
		ret = rbd_req_sync_watch(rbd_dev);
		if (ret == -ERANGE) {
			rc = rbd_dev_refresh(rbd_dev, NULL);
			if (rc < 0)
				return rc;	/* refresh failure is fatal */
		}
	} while (ret == -ERANGE);

	return ret;
}
3011
/* Highest device id handed out so far; ids start at 1 (see rbd_dev_id_get) */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06003013
/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* atomic increment makes the id unique without taking a lock */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	/* Only the list manipulation needs the spinlock */
	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06003028
/*
 * Remove an rbd_dev from the global list, and record that its
 * identifier is no longer in use.
 */
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
{
	struct list_head *tmp;
	int rbd_id = rbd_dev->dev_id;
	int max_id;

	rbd_assert(rbd_id > 0);

	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);

	/*
	 * If the id being "put" is not the current maximum, there
	 * is nothing special we need to do.
	 */
	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
		spin_unlock(&rbd_dev_list_lock);
		return;
	}

	/*
	 * We need to update the current maximum id.  Search the
	 * list to find out what it is.  We're more likely to find
	 * the maximum at the end, so search the list backward.
	 */
	max_id = 0;
	list_for_each_prev(tmp, &rbd_dev_list) {
		/* Note: deliberately shadows the function parameter */
		struct rbd_device *rbd_dev;

		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id > max_id)
			max_id = rbd_dev->dev_id;
	}
	spin_unlock(&rbd_dev_list_lock);

	/*
	 * The max id could have been updated by rbd_dev_id_get(), in
	 * which case it now accurately reflects the new maximum.
	 * Be careful not to overwrite the maximum value in that
	 * case.
	 */
	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
	dout(" max dev id has been reset\n");
}
3079
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/* Characters for which isspace() is nonzero in the C/POSIX locales */
	const char *delims = " \f\n\r\t\v";
	const char *start;

	start = *buf + strspn(*buf, delims);	/* skip leading whitespace */
	*buf = start;

	return strcspn(start, delims);		/* length of the token */
}
3098
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	/* Copy only if it fits, including the terminating NUL */
	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
3128
3129/*
Alex Elderea3352f2012-07-09 21:04:23 -05003130 * Finds the next token in *buf, dynamically allocates a buffer big
3131 * enough to hold a copy of it, and copies the token into the new
3132 * buffer. The copy is guaranteed to be terminated with '\0'. Note
3133 * that a duplicate buffer is created even for a zero-length token.
3134 *
3135 * Returns a pointer to the newly-allocated duplicate, or a null
3136 * pointer if memory for the duplicate was not available. If
3137 * the lenp argument is a non-null pointer, the length of the token
3138 * (not including the '\0') is returned in *lenp.
3139 *
3140 * If successful, the *buf pointer will be updated to point beyond
3141 * the end of the found token.
3142 *
3143 * Note: uses GFP_KERNEL for allocation.
3144 */
3145static inline char *dup_token(const char **buf, size_t *lenp)
3146{
3147 char *dup;
3148 size_t len;
3149
3150 len = next_token(buf);
3151 dup = kmalloc(len + 1, GFP_KERNEL);
3152 if (!dup)
3153 return NULL;
3154
3155 memcpy(dup, *buf, len);
3156 *(dup + len) = '\0';
3157 *buf += len;
3158
3159 if (lenp)
3160 *lenp = len;
3161
3162 return dup;
3163}
3164
3165/*
Alex Elder859c31d2012-10-25 23:34:42 -05003166 * Parse the options provided for an "rbd add" (i.e., rbd image
3167 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
3168 * and the data written is passed here via a NUL-terminated buffer.
3169 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05003170 *
Alex Elder859c31d2012-10-25 23:34:42 -05003171 * The information extracted from these options is recorded in
3172 * the other parameters which return dynamically-allocated
3173 * structures:
3174 * ceph_opts
3175 * The address of a pointer that will refer to a ceph options
3176 * structure. Caller must release the returned pointer using
3177 * ceph_destroy_options() when it is no longer needed.
3178 * rbd_opts
3179 * Address of an rbd options pointer. Fully initialized by
3180 * this function; caller must release with kfree().
3181 * spec
3182 * Address of an rbd image specification pointer. Fully
3183 * initialized by this function based on parsed options.
3184 * Caller must release with rbd_spec_put().
3185 *
3186 * The options passed take this form:
3187 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3188 * where:
3189 * <mon_addrs>
3190 * A comma-separated list of one or more monitor addresses.
3191 * A monitor address is an ip address, optionally followed
3192 * by a port number (separated by a colon).
3193 * I.e.: ip1[:port1][,ip2[:port2]...]
3194 * <options>
3195 * A comma-separated list of ceph and/or rbd options.
3196 * <pool_name>
3197 * The name of the rados pool containing the rbd image.
3198 * <image_name>
3199 * The name of the image in that pool to map.
3200 * <snap_id>
3201 * An optional snapshot id. If provided, the mapping will
3202 * present data from the image at the time that snapshot was
3203 * created. The image head is used if no snapshot id is
3204 * provided. Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06003205 */
Alex Elder859c31d2012-10-25 23:34:42 -05003206static int rbd_add_parse_args(const char *buf,
Alex Elderdc79b112012-10-25 23:34:41 -05003207 struct ceph_options **ceph_opts,
Alex Elder859c31d2012-10-25 23:34:42 -05003208 struct rbd_options **opts,
3209 struct rbd_spec **rbd_spec)
Alex Eldera725f65e2012-02-02 08:13:30 -06003210{
Alex Elderd22f76e2012-07-12 10:46:35 -05003211 size_t len;
Alex Elder859c31d2012-10-25 23:34:42 -05003212 char *options;
Alex Elder0ddebc02012-10-25 23:34:41 -05003213 const char *mon_addrs;
3214 size_t mon_addrs_size;
Alex Elder859c31d2012-10-25 23:34:42 -05003215 struct rbd_spec *spec = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05003216 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05003217 struct ceph_options *copts;
Alex Elderdc79b112012-10-25 23:34:41 -05003218 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06003219
3220 /* The first four tokens are required */
3221
Alex Elder7ef32142012-02-02 08:13:30 -06003222 len = next_token(&buf);
3223 if (!len)
Alex Elderdc79b112012-10-25 23:34:41 -05003224 return -EINVAL; /* Missing monitor address(es) */
Alex Elder0ddebc02012-10-25 23:34:41 -05003225 mon_addrs = buf;
Alex Elderf28e5652012-10-25 23:34:41 -05003226 mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06003227 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06003228
Alex Elderdc79b112012-10-25 23:34:41 -05003229 ret = -EINVAL;
Alex Elderf28e5652012-10-25 23:34:41 -05003230 options = dup_token(&buf, NULL);
3231 if (!options)
Alex Elderdc79b112012-10-25 23:34:41 -05003232 return -ENOMEM;
Alex Elderf28e5652012-10-25 23:34:41 -05003233 if (!*options)
3234 goto out_err; /* Missing options */
Alex Eldera725f65e2012-02-02 08:13:30 -06003235
Alex Elder859c31d2012-10-25 23:34:42 -05003236 spec = rbd_spec_alloc();
3237 if (!spec)
Alex Elderf28e5652012-10-25 23:34:41 -05003238 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05003239
3240 spec->pool_name = dup_token(&buf, NULL);
3241 if (!spec->pool_name)
3242 goto out_mem;
3243 if (!*spec->pool_name)
Alex Elderf28e5652012-10-25 23:34:41 -05003244 goto out_err; /* Missing pool name */
Alex Eldere28fff262012-02-02 08:13:30 -06003245
Alex Elder859c31d2012-10-25 23:34:42 -05003246 spec->image_name = dup_token(&buf, &spec->image_name_len);
3247 if (!spec->image_name)
Alex Elderf28e5652012-10-25 23:34:41 -05003248 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05003249 if (!*spec->image_name)
Alex Elderf28e5652012-10-25 23:34:41 -05003250 goto out_err; /* Missing image name */
Alex Eldere28fff262012-02-02 08:13:30 -06003251
Alex Elderf28e5652012-10-25 23:34:41 -05003252 /*
3253 * Snapshot name is optional; default is to use "-"
3254 * (indicating the head/no snapshot).
3255 */
Alex Elder3feeb8942012-08-31 17:29:52 -05003256 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05003257 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05003258 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
3259 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elderf28e5652012-10-25 23:34:41 -05003260 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
Alex Elderdc79b112012-10-25 23:34:41 -05003261 ret = -ENAMETOOLONG;
Alex Elderf28e5652012-10-25 23:34:41 -05003262 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05003263 }
Alex Elder859c31d2012-10-25 23:34:42 -05003264 spec->snap_name = kmalloc(len + 1, GFP_KERNEL);
3265 if (!spec->snap_name)
Alex Elderf28e5652012-10-25 23:34:41 -05003266 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05003267 memcpy(spec->snap_name, buf, len);
3268 *(spec->snap_name + len) = '\0';
Alex Eldere5c35532012-10-25 23:34:41 -05003269
Alex Elder0ddebc02012-10-25 23:34:41 -05003270 /* Initialize all rbd options to the defaults */
Alex Eldere28fff262012-02-02 08:13:30 -06003271
Alex Elder4e9afeb2012-10-25 23:34:41 -05003272 rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
3273 if (!rbd_opts)
3274 goto out_mem;
3275
3276 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
Alex Elderd22f76e2012-07-12 10:46:35 -05003277
Alex Elder859c31d2012-10-25 23:34:42 -05003278 copts = ceph_parse_options(options, mon_addrs,
Alex Elder0ddebc02012-10-25 23:34:41 -05003279 mon_addrs + mon_addrs_size - 1,
Alex Elder4e9afeb2012-10-25 23:34:41 -05003280 parse_rbd_opts_token, rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05003281 if (IS_ERR(copts)) {
3282 ret = PTR_ERR(copts);
Alex Elderdc79b112012-10-25 23:34:41 -05003283 goto out_err;
3284 }
Alex Elder859c31d2012-10-25 23:34:42 -05003285 kfree(options);
3286
3287 *ceph_opts = copts;
Alex Elder4e9afeb2012-10-25 23:34:41 -05003288 *opts = rbd_opts;
Alex Elder859c31d2012-10-25 23:34:42 -05003289 *rbd_spec = spec;
Alex Elder0ddebc02012-10-25 23:34:41 -05003290
Alex Elderdc79b112012-10-25 23:34:41 -05003291 return 0;
Alex Elderf28e5652012-10-25 23:34:41 -05003292out_mem:
Alex Elderdc79b112012-10-25 23:34:41 -05003293 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05003294out_err:
Alex Elder859c31d2012-10-25 23:34:42 -05003295 kfree(rbd_opts);
3296 rbd_spec_put(spec);
Alex Elderf28e5652012-10-25 23:34:41 -05003297 kfree(options);
Alex Elderd22f76e2012-07-12 10:46:35 -05003298
Alex Elderdc79b112012-10-25 23:34:41 -05003299 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06003300}
3301
Alex Elder589d30e2012-07-10 20:30:11 -05003302/*
3303 * An rbd format 2 image has a unique identifier, distinct from the
3304 * name given to it by the user. Internally, that identifier is
3305 * what's used to specify the names of objects related to the image.
3306 *
3307 * A special "rbd id" object is used to map an rbd image name to its
3308 * id. If that object doesn't exist, then there is no v2 rbd image
3309 * with the supplied name.
3310 *
3311 * This function will record the given rbd_dev's image_id field if
3312 * it can be determined, and in that case will return 0. If any
3313 * errors occur a negative errno will be returned and the rbd_dev's
3314 * image_id field will be unchanged (and should be NULL).
3315 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.
	 */
	if (rbd_dev->spec->image_id)
		return 0;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	/* sizeof () includes the NUL, so this covers the full "<prefix><name>" */
	size = sizeof (RBD_ID_PREFIX) + rbd_dev->spec->image_name_len;
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* Invoke the "get_id" class method on the rbd id object */
	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	/* Decode the length-prefixed string into a freshly allocated id */
	p = response;
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						&rbd_dev->spec->image_id_len,
						GFP_NOIO);
	if (IS_ERR(rbd_dev->spec->image_id)) {
		ret = PTR_ERR(rbd_dev->spec->image_id);
		/* keep the documented invariant: image_id stays NULL on error */
		rbd_dev->spec->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
3379
/*
 * Fill in an rbd_device for a format 1 (original-format) image:
 * record an empty image id, build the header object name from the
 * image name, and read the on-disk header into rbd_dev->header.
 * Returns 0 on success; on error all fields set here are undone.
 */
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->spec->image_id)
		return -ENOMEM;
	rbd_dev->spec->image_id_len = 0;

	/* Record the header object name for this rbd image. */

	/* sizeof () includes the NUL, covering "<image_name><suffix>" */
	size = rbd_dev->spec->image_name_len + sizeof (RBD_SUFFIX);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	sprintf(rbd_dev->header_name, "%s%s",
		rbd_dev->spec->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;

	/* Version 1 images have no parent (no layering) */

	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;

	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	/* undo everything set above; restore the "unprobed" state */
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;

	return ret;
}
3429
/*
 * Fill in an rbd_device for a format 2 image.  The image id must
 * already be recorded in rbd_dev->spec (see rbd_dev_image_id()).
 * Gathers size/order, object prefix, features, optional parent
 * (layering) info, and the snapshot context from the header object.
 * Returns 0 on success; on error all fields set here are undone.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->spec->image_id_len;
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	/* undo everything set above, including any parent reference */
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3502
/*
 * Complete the probe of an rbd device after its header has been
 * read: refresh snapshots, finish the spec, set up the mapping,
 * allocate a device id and block major, create the gendisk and
 * sysfs entries, register snapshots, start the watch, and finally
 * announce the disk.  On failure each step is unwound in reverse;
 * once rbd_bus_add_dev() has succeeded, cleanup is delegated to
 * the sysfs release path via rbd_bus_del_dev().
 */
static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_probe_update_spec(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	ret = register_blkdev(0, rbd_dev->name);	/* 0 = dynamic major */
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	ret = rbd_init_watch_dev(rbd_dev);
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;
err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}
3584
Alex Eldera30b71b2012-07-10 20:30:11 -05003585/*
3586 * Probe for the existence of the header object for the given rbd
3587 * device. For format 2 images this includes determining the image
3588 * id.
3589 */
3590static int rbd_dev_probe(struct rbd_device *rbd_dev)
3591{
3592 int ret;
3593
3594 /*
3595 * Get the id from the image id object. If it's not a
3596 * format 2 image, we'll get ENOENT back, and we'll assume
3597 * it's a format 1 image.
3598 */
3599 ret = rbd_dev_image_id(rbd_dev);
3600 if (ret)
3601 ret = rbd_dev_v1_probe(rbd_dev);
3602 else
3603 ret = rbd_dev_v2_probe(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05003604 if (ret) {
Alex Eldera30b71b2012-07-10 20:30:11 -05003605 dout("probe failed, returning %d\n", ret);
3606
Alex Elder83a06262012-10-30 15:47:17 -05003607 return ret;
3608 }
3609
3610 ret = rbd_dev_probe_finish(rbd_dev);
3611 if (ret)
3612 rbd_header_free(&rbd_dev->header);
3613
Alex Eldera30b71b2012-07-10 20:30:11 -05003614 return ret;
3615}
3616
/*
 * sysfs "add" handler: parse the user-supplied mapping description,
 * connect to the cluster, resolve the pool, create the rbd_device,
 * and probe/activate the image.  Ownership of the parsed structures
 * is handed off step by step (ceph_opts -> client, client and spec
 * -> rbd_dev); pointers are NULLed as each transfer happens so the
 * error path only frees what is still owned here.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	/* hold a module reference for the lifetime of the mapping */
	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto err_out_module;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}
	ceph_opts = NULL;	/* rbd_dev client now owns this */

	/* pick the pool */
	osdc = &rbdc->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
	if (rc < 0)
		goto err_out_client;
	spec->pool_id = (u64) rc;

	rbd_dev = rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		goto err_out_client;	/* rc is still -ENOMEM here */
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */

	rbd_dev->mapping.read_only = rbd_opts->read_only;
	kfree(rbd_opts);
	rbd_opts = NULL;	/* done with this */

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_rbd_dev;

	return count;
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_opts);
	rbd_spec_put(spec);
err_out_module:
	module_put(THIS_MODULE);

	dout("Error adding device %s\n", buf);

	return (ssize_t) rc;
}
3682
Alex Elderde71a292012-07-03 16:01:19 -05003683static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003684{
3685 struct list_head *tmp;
3686 struct rbd_device *rbd_dev;
3687
Alex Eldere124a82f2012-01-29 13:57:44 -06003688 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003689 list_for_each(tmp, &rbd_dev_list) {
3690 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05003691 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a82f2012-01-29 13:57:44 -06003692 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003693 return rbd_dev;
Alex Eldere124a82f2012-01-29 13:57:44 -06003694 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003695 }
Alex Eldere124a82f2012-01-29 13:57:44 -06003696 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003697 return NULL;
3698}
3699
/*
 * Device-model release callback: final teardown of an rbd device
 * once sysfs drops its last reference.  Stops the watch, frees the
 * disk, major number, header, and device id, then drops the module
 * reference taken in rbd_add().
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);


	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
3729
/*
 * sysfs "remove" handler: parse a device id from the buffer and tear
 * down the corresponding mapping.  Fails with -ENOENT if no such
 * device exists and -EBUSY if it is still open.
 */
static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	int ret = count;	/* success reports the full input consumed */

	/* NOTE(review): strict_strtoul() is deprecated upstream in
	 * favor of kstrtoul() — consider converting when convenient. */
	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	if (rbd_dev->open_count) {
		ret = -EBUSY;
		goto done;
	}

	rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);

done:
	mutex_unlock(&ctl_mutex);

	return ret;
}
3769
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003770/*
3771 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003772 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003773 */
3774static int rbd_sysfs_init(void)
3775{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003776 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003777
Alex Elderfed4c142012-02-07 12:03:36 -06003778 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06003779 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003780 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003781
Alex Elderfed4c142012-02-07 12:03:36 -06003782 ret = bus_register(&rbd_bus_type);
3783 if (ret < 0)
3784 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003785
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003786 return ret;
3787}
3788
/* Tear down sysfs state in the reverse order of rbd_sysfs_init(). */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3794
3795int __init rbd_init(void)
3796{
3797 int rc;
3798
3799 rc = rbd_sysfs_init();
3800 if (rc)
3801 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06003802 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003803 return 0;
3804}
3805
/* Module exit: remove the sysfs entries registered in rbd_init(). */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3810
/* Module entry/exit points and metadata */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");