blob: b8b8271bd9e22564f19f51956f5dff8c8ffef63a [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elderaafb2302012-09-06 16:00:54 -050044#define RBD_DEBUG /* Activate rbd_assert() calls */
45
Alex Elder593a9e72012-02-07 12:03:37 -060046/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
Alex Elderdf111be2012-08-09 10:33:26 -070055/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
Alex Elderf0f8cef2012-01-29 13:57:44 -060059#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070061
62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
63
Yehuda Sadeh602adf42010-08-12 16:11:25 -070064#define RBD_MAX_SNAP_NAME_LEN 32
65#define RBD_MAX_OPT_LEN 1024
66
67#define RBD_SNAP_HEAD_NAME "-"
68
Alex Elder1e130192012-07-03 16:01:19 -050069#define RBD_IMAGE_ID_LEN_MAX 64
70#define RBD_OBJ_PREFIX_LEN_MAX 64
Alex Elder589d30e2012-07-10 20:30:11 -050071
Alex Elder81a89792012-02-02 08:13:30 -060072/*
73 * An RBD device name will be "rbd#", where the "rbd" comes from
74 * RBD_DRV_NAME above, and # is a unique integer identifier.
75 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
76 * enough to hold all possible device names.
77 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070078#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060079#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070080
Alex Eldercc0538b2012-08-10 13:12:07 -070081#define RBD_READ_ONLY_DEFAULT false
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070082
Yehuda Sadeh602adf42010-08-12 16:11:25 -070083/*
84 * block device image metadata (in-memory version)
85 */
struct rbd_image_header {
	/* These fields never change for a given rbd image */
	char *object_prefix;	/* prefix of per-segment object names (see rbd_segment_name()) */
	u64 features;		/* always 0 for format 1 images */
	__u8 obj_order;		/* log2 of the segment (object) size */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* current image size, in bytes */
	struct ceph_snap_context *snapc;	/* snapshot seq + id array */
	char *snap_names;	/* copy of the on-disk snapshot name blob */
	u64 *snap_sizes;	/* image size at each snapshot, same order as snapc */

	u64 obj_version;
};
102
/* User-settable mapping options, filled in by parse_rbd_opts_token() */
struct rbd_options {
	bool	read_only;	/* map the device read-only */
};
106
107/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600108 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700109 */
struct rbd_client {
	struct ceph_client	*client;	/* underlying libceph client */
	struct kref		kref;		/* freed via rbd_client_release() */
	struct list_head	node;		/* entry on rbd_client_list */
};
115
116/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600117 * a request completion status
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700118 */
struct rbd_req_status {
	int done;	/* nonzero once this request has completed */
	int rc;		/* completion result code */
	u64 bytes;	/* byte count for the request */
};
124
125/*
126 * a collection of requests
127 */
128struct rbd_req_coll {
129 int total;
130 int num_done;
131 struct kref kref;
132 struct rbd_req_status status[0];
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700133};
134
Alex Elderf0f8cef2012-01-29 13:57:44 -0600135/*
136 * a single io request
137 */
struct rbd_request {
	struct request		*rq;		/* blk layer request */
	struct bio		*bio;		/* cloned bio */
	struct page		**pages;	/* list of used pages */
	u64			len;
	int			coll_index;	/* presumably an index into coll->status[] — verify at use sites */
	struct rbd_req_coll	*coll;		/* completion collection, if any */
};
146
struct rbd_snap {
	struct device		dev;		/* sysfs device for this snapshot */
	const char		*name;
	u64			size;		/* image size at this snapshot */
	struct list_head	node;		/* entry on rbd_dev->snaps */
	u64			id;		/* snapshot id */
	u64			features;
};
155
/*
 * State of the image as it is currently mapped: either the image
 * head or a single snapshot.  Filled in by rbd_dev_set_mapping().
 */
struct rbd_mapping {
	char		*snap_name;	/* RBD_SNAP_HEAD_NAME or a snapshot name */
	u64		snap_id;	/* CEPH_NOSNAP when mapping the head */
	u64		size;
	u64		features;
	bool		snap_exists;	/* false when mapping the head */
	bool		read_only;	/* snapshots are always read-only */
};
164
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700165/*
166 * a single device
167 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_options	rbd_opts;	/* user options from the map request */
	struct rbd_client	*rbd_client;	/* possibly-shared ceph client */

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue lock */

	struct rbd_image_header	header;		/* in-memory image metadata */
	char			*image_id;
	size_t			image_id_len;
	char			*image_name;
	size_t			image_name_len;
	char			*header_name;
	char			*pool_name;
	int			pool_id;

	struct ceph_osd_event   *watch_event;
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;	/* what is currently mapped (head or snap) */

	struct list_head	node;		/* entry on the global rbd_dev_list */

	/* list of snapshots (struct rbd_snap, linked via snap->node) */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
};
207
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700208static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a82f2012-01-29 13:57:44 -0600209
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700210static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a82f2012-01-29 13:57:44 -0600211static DEFINE_SPINLOCK(rbd_dev_list_lock);
212
Alex Elder432b8582012-01-29 13:57:44 -0600213static LIST_HEAD(rbd_client_list); /* clients */
214static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700215
Alex Elder304f6802012-08-31 17:29:52 -0500216static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
217static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
218
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800219static void rbd_dev_release(struct device *dev);
Alex Elder14e70852012-07-19 09:09:27 -0500220static void __rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800221
Alex Elderf0f8cef2012-01-29 13:57:44 -0600222static ssize_t rbd_add(struct bus_type *bus, const char *buf,
223 size_t count);
224static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
225 size_t count);
226
/* Write-only bus attributes: /sys/bus/rbd/add and /sys/bus/rbd/remove */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

/* Pseudo bus that all rbd devices are registered on */
static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};
237
/*
 * Intentionally empty: rbd_root_dev is statically allocated, so there
 * is nothing to free, but the device core expects a release callback.
 */
static void rbd_root_dev_release(struct device *dev)
{
}
241
/* Root device that all rbd devices hang off of */
static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};
246
#ifdef RBD_DEBUG
/*
 * Verbose BUG() on assertion failure.  Wrapped in do { } while (0) so
 * the macro expands to a single statement; the previous bare
 * "if (...) { ... }" form had a dangling-else hazard when used in an
 * unbraced if/else body.
 */
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
					"at line %d:\n\n"		\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800259
/* Take a reference on the rbd device's embedded struct device */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}
264
/* Drop a reference taken with rbd_get_dev() */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700269
Alex Elder1fe5e992012-07-25 09:32:41 -0500270static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700271
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700272static int rbd_open(struct block_device *bdev, fmode_t mode)
273{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600274 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700275
Alex Elderf84344f2012-08-31 17:29:51 -0500276 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700277 return -EROFS;
278
Alex Elder340c7a22012-08-10 13:12:07 -0700279 rbd_get_dev(rbd_dev);
Alex Elderf84344f2012-08-31 17:29:51 -0500280 set_device_ro(bdev, rbd_dev->mapping.read_only);
Alex Elder340c7a22012-08-10 13:12:07 -0700281
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700282 return 0;
283}
284
/* Block device release: drop the reference taken in rbd_open() */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	rbd_put_dev(rbd_dev);

	return 0;
}
293
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700294static const struct block_device_operations rbd_bd_ops = {
295 .owner = THIS_MODULE,
296 .open = rbd_open,
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800297 .release = rbd_release,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700298};
299
300/*
301 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500302 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700303 */
Alex Elderf8c38922012-08-10 13:12:07 -0700304static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700305{
306 struct rbd_client *rbdc;
307 int ret = -ENOMEM;
308
309 dout("rbd_client_create\n");
310 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
311 if (!rbdc)
312 goto out_opt;
313
314 kref_init(&rbdc->kref);
315 INIT_LIST_HEAD(&rbdc->node);
316
Alex Elderbc534d82012-01-29 13:57:44 -0600317 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
318
Alex Elder43ae4702012-07-03 16:01:18 -0500319 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700320 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600321 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500322 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700323
324 ret = ceph_open_session(rbdc->client);
325 if (ret < 0)
326 goto out_err;
327
Alex Elder432b8582012-01-29 13:57:44 -0600328 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700329 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600330 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700331
Alex Elderbc534d82012-01-29 13:57:44 -0600332 mutex_unlock(&ctl_mutex);
333
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700334 dout("rbd_client_create created %p\n", rbdc);
335 return rbdc;
336
337out_err:
338 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600339out_mutex:
340 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700341 kfree(rbdc);
342out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500343 if (ceph_opts)
344 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400345 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700346}
347
348/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700349 * Find a ceph client with specific addr and configuration. If
350 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700351 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700352static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700353{
354 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700355 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700356
Alex Elder43ae4702012-07-03 16:01:18 -0500357 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700358 return NULL;
359
Alex Elder1f7ba332012-08-10 13:12:07 -0700360 spin_lock(&rbd_client_list_lock);
361 list_for_each_entry(client_node, &rbd_client_list, node) {
362 if (!ceph_compare_options(ceph_opts, client_node->client)) {
363 kref_get(&client_node->kref);
364 found = true;
365 break;
366 }
367 }
368 spin_unlock(&rbd_client_list_lock);
369
370 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700371}
372
373/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700374 * mount options
375 */
376enum {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700377 Opt_last_int,
378 /* int args above */
379 Opt_last_string,
380 /* string args above */
Alex Eldercc0538b2012-08-10 13:12:07 -0700381 Opt_read_only,
382 Opt_read_write,
383 /* Boolean args above */
384 Opt_last_bool,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700385};
386
Alex Elder43ae4702012-07-03 16:01:18 -0500387static match_table_t rbd_opts_tokens = {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700388 /* int args above */
389 /* string args above */
Alex Elderf84344f2012-08-31 17:29:51 -0500390 {Opt_read_only, "mapping.read_only"},
Alex Eldercc0538b2012-08-10 13:12:07 -0700391 {Opt_read_only, "ro"}, /* Alternate spelling */
392 {Opt_read_write, "read_write"},
393 {Opt_read_write, "rw"}, /* Alternate spelling */
394 /* Boolean args above */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700395 {-1, NULL}
396};
397
/*
 * Parse one rbd-specific option token.  Installed as the callback of
 * ceph_parse_options() (see rbd_get_client()); *private points at the
 * struct rbd_options being filled in.  Returns 0 on success or a
 * negative errno.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/*
	 * The Opt_last_* markers partition the token space by argument
	 * type: ints first, then strings, then Booleans (no argument).
	 */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* Every token in rbd_opts_tokens must be handled above */
		rbd_assert(false);
		break;
	}
	return 0;
}
438
439/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700440 * Get a ceph client with specific addr and configuration, if one does
441 * not exist create it.
442 */
Alex Elderf8c38922012-08-10 13:12:07 -0700443static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
444 size_t mon_addr_len, char *options)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700445{
Alex Elderf8c38922012-08-10 13:12:07 -0700446 struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
Alex Elder43ae4702012-07-03 16:01:18 -0500447 struct ceph_options *ceph_opts;
Alex Elderf8c38922012-08-10 13:12:07 -0700448 struct rbd_client *rbdc;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700449
Alex Eldercc0538b2012-08-10 13:12:07 -0700450 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700451
Alex Elder43ae4702012-07-03 16:01:18 -0500452 ceph_opts = ceph_parse_options(options, mon_addr,
453 mon_addr + mon_addr_len,
454 parse_rbd_opts_token, rbd_opts);
Alex Elderf8c38922012-08-10 13:12:07 -0700455 if (IS_ERR(ceph_opts))
456 return PTR_ERR(ceph_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700457
Alex Elder1f7ba332012-08-10 13:12:07 -0700458 rbdc = rbd_client_find(ceph_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700459 if (rbdc) {
Alex Eldere6994d3d2012-01-29 13:57:44 -0600460 /* using an existing client */
Alex Elder43ae4702012-07-03 16:01:18 -0500461 ceph_destroy_options(ceph_opts);
Alex Elderf8c38922012-08-10 13:12:07 -0700462 } else {
463 rbdc = rbd_client_create(ceph_opts);
464 if (IS_ERR(rbdc))
465 return PTR_ERR(rbdc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700466 }
Alex Elderf8c38922012-08-10 13:12:07 -0700467 rbd_dev->rbd_client = rbdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700468
Alex Elderf8c38922012-08-10 13:12:07 -0700469 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700470}
471
/*
 * Destroy ceph client
 *
 * NOTE(review): rbd_client_list_lock is taken inside this function,
 * so callers must NOT already hold it (the old comment claimed the
 * opposite and was stale).
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	/* Unlink from the shared client list before tearing down */
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
489
490/*
491 * Drop reference to ceph client node. If it's not referenced anymore, release
492 * it.
493 */
static void rbd_put_client(struct rbd_device *rbd_dev)
{
	/* May free the client (via rbd_client_release) if this was the last ref */
	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
	rbd_dev->rbd_client = NULL;
}
499
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700500/*
501 * Destroy requests collection
502 */
static void rbd_coll_release(struct kref *kref)
{
	struct rbd_req_coll *coll =
		container_of(kref, struct rbd_req_coll, kref);

	dout("rbd_coll_release %p\n", coll);
	/* The collection and its trailing status array are one allocation */
	kfree(coll);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700511
Alex Eldera30b71b2012-07-10 20:30:11 -0500512static bool rbd_image_format_valid(u32 image_format)
513{
514 return image_format == 1 || image_format == 2;
515}
516
/*
 * Sanity-check an on-disk (format 1) image header before trusting
 * any of its fields.  Returns false if the header cannot be valid.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
545
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700546/*
547 * Create a new header structure, translate header format from the on-disk
548 * header.
549 */
550static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500551 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700552{
Alex Elderccece232012-07-10 20:30:10 -0500553 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500554 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500555 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500556 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700557
Alex Elder6a523252012-07-19 17:12:59 -0500558 memset(header, 0, sizeof (*header));
559
Alex Elder103a1502012-08-02 11:29:45 -0500560 snap_count = le32_to_cpu(ondisk->snap_count);
561
Alex Elder58c17b02012-08-23 23:22:06 -0500562 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
563 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500564 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700565 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500566 memcpy(header->object_prefix, ondisk->object_prefix, len);
567 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600568
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700569 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500570 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
571
Alex Elder621901d2012-08-23 23:22:06 -0500572 /* Save a copy of the snapshot names */
573
Alex Elderf785cc12012-08-23 23:22:06 -0500574 if (snap_names_len > (u64) SIZE_MAX)
575 return -EIO;
576 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700577 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500578 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500579 /*
580 * Note that rbd_dev_v1_header_read() guarantees
581 * the ondisk buffer we're working with has
582 * snap_names_len bytes beyond the end of the
583 * snapshot id array, this memcpy() is safe.
584 */
585 memcpy(header->snap_names, &ondisk->snaps[snap_count],
586 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500587
Alex Elder621901d2012-08-23 23:22:06 -0500588 /* Record each snapshot's size */
589
Alex Elderd2bb24e2012-07-26 23:37:14 -0500590 size = snap_count * sizeof (*header->snap_sizes);
591 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700592 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500593 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500594 for (i = 0; i < snap_count; i++)
595 header->snap_sizes[i] =
596 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700597 } else {
Alex Elderccece232012-07-10 20:30:10 -0500598 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700599 header->snap_names = NULL;
600 header->snap_sizes = NULL;
601 }
Alex Elder849b4262012-07-09 21:04:24 -0500602
Alex Elder34b13182012-07-13 20:35:12 -0500603 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700604 header->obj_order = ondisk->options.order;
605 header->crypt_type = ondisk->options.crypt_type;
606 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500607
Alex Elder621901d2012-08-23 23:22:06 -0500608 /* Allocate and fill in the snapshot context */
609
Alex Elderf84344f2012-08-31 17:29:51 -0500610 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500611 size = sizeof (struct ceph_snap_context);
612 size += snap_count * sizeof (header->snapc->snaps[0]);
613 header->snapc = kzalloc(size, GFP_KERNEL);
614 if (!header->snapc)
615 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700616
617 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500618 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700619 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500620 for (i = 0; i < snap_count; i++)
621 header->snapc->snaps[i] =
622 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700623
624 return 0;
625
Alex Elder6a523252012-07-19 17:12:59 -0500626out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500627 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500628 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700629 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500630 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500631 kfree(header->object_prefix);
632 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500633
Alex Elder00f1f362012-02-07 12:03:36 -0600634 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700635}
636
Alex Elder8836b992012-08-30 14:42:15 -0500637static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700638{
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700639
Alex Eldere86924a2012-07-10 20:30:11 -0500640 struct rbd_snap *snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600641
Alex Eldere86924a2012-07-10 20:30:11 -0500642 list_for_each_entry(snap, &rbd_dev->snaps, node) {
643 if (!strcmp(snap_name, snap->name)) {
644 rbd_dev->mapping.snap_id = snap->id;
645 rbd_dev->mapping.size = snap->size;
Alex Elder34b13182012-07-13 20:35:12 -0500646 rbd_dev->mapping.features = snap->features;
Alex Elder00f1f362012-02-07 12:03:36 -0600647
Alex Eldere86924a2012-07-10 20:30:11 -0500648 return 0;
Alex Elder00f1f362012-02-07 12:03:36 -0600649 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700650 }
Alex Eldere86924a2012-07-10 20:30:11 -0500651
Alex Elder00f1f362012-02-07 12:03:36 -0600652 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700653}
654
Alex Elder5ed16172012-08-29 17:11:07 -0500655static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700656{
Alex Elder78dc4472012-07-19 08:49:18 -0500657 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700658
Alex Elder4e1105a2012-08-31 17:29:52 -0500659 if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
Josh Durgincc9d7342011-11-21 18:19:13 -0800660 sizeof (RBD_SNAP_HEAD_NAME))) {
Alex Elderf84344f2012-08-31 17:29:51 -0500661 rbd_dev->mapping.snap_id = CEPH_NOSNAP;
Alex Elder99c1f082012-08-30 14:42:15 -0500662 rbd_dev->mapping.size = rbd_dev->header.image_size;
Alex Elder34b13182012-07-13 20:35:12 -0500663 rbd_dev->mapping.features = rbd_dev->header.features;
Alex Elderf84344f2012-08-31 17:29:51 -0500664 rbd_dev->mapping.snap_exists = false;
665 rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only;
Alex Eldere86924a2012-07-10 20:30:11 -0500666 ret = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700667 } else {
Alex Elder8836b992012-08-30 14:42:15 -0500668 ret = snap_by_name(rbd_dev, snap_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700669 if (ret < 0)
670 goto done;
Alex Elderf84344f2012-08-31 17:29:51 -0500671 rbd_dev->mapping.snap_exists = true;
672 rbd_dev->mapping.read_only = true;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700673 }
Alex Elder4e1105a2012-08-31 17:29:52 -0500674 rbd_dev->mapping.snap_name = snap_name;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700675done:
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700676 return ret;
677}
678
/* Release everything rbd_header_from_disk() allocated */
static void rbd_header_free(struct rbd_image_header *header)
{
	/* NULL each pointer afterward so a repeated free is harmless */
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	/* The snapshot context is refcounted, not freed directly */
	ceph_put_snap_context(header->snapc);
	header->snapc = NULL;
}
690
Alex Elder65ccfe22012-08-09 10:33:26 -0700691static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700692{
Alex Elder65ccfe22012-08-09 10:33:26 -0700693 char *name;
694 u64 segment;
695 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700696
Alex Elder65ccfe22012-08-09 10:33:26 -0700697 name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
698 if (!name)
699 return NULL;
700 segment = offset >> rbd_dev->header.obj_order;
701 ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
702 rbd_dev->header.object_prefix, segment);
703 if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
704 pr_err("error formatting segment name for #%llu (%d)\n",
705 segment, ret);
706 kfree(name);
707 name = NULL;
708 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700709
Alex Elder65ccfe22012-08-09 10:33:26 -0700710 return name;
711}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700712
Alex Elder65ccfe22012-08-09 10:33:26 -0700713static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
714{
715 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700716
Alex Elder65ccfe22012-08-09 10:33:26 -0700717 return offset & (segment_size - 1);
718}
719
720static u64 rbd_segment_length(struct rbd_device *rbd_dev,
721 u64 offset, u64 length)
722{
723 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
724
725 offset &= segment_size - 1;
726
Alex Elderaafb2302012-09-06 16:00:54 -0500727 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700728 if (offset + length > segment_size)
729 length = segment_size - offset;
730
731 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700732}
733
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700734static int rbd_get_num_segments(struct rbd_image_header *header,
735 u64 ofs, u64 len)
736{
Alex Elderdf111be2012-08-09 10:33:26 -0700737 u64 start_seg;
738 u64 end_seg;
739
740 if (!len)
741 return 0;
742 if (len - 1 > U64_MAX - ofs)
743 return -ERANGE;
744
745 start_seg = ofs >> header->obj_order;
746 end_seg = (ofs + len - 1) >> header->obj_order;
747
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700748 return end_seg - start_seg + 1;
749}
750
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700751/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700752 * returns the size of an object in the image
753 */
754static u64 rbd_obj_bytes(struct rbd_image_header *header)
755{
756 return 1 << header->obj_order;
757}
758
759/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700760 * bio helpers
761 */
762
763static void bio_chain_put(struct bio *chain)
764{
765 struct bio *tmp;
766
767 while (chain) {
768 tmp = chain;
769 chain = chain->bi_next;
770 bio_put(tmp);
771 }
772}
773
/*
 * Zero the data in a chain of bios from byte offset @start_ofs
 * (relative to the start of the chain) through the end of the
 * chain.  Bytes before @start_ofs are left untouched.  Used to
 * clear the unread tail of a short (or failed) read.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* byte position within the whole chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			/* does this biovec extend past start_ofs? */
			if (pos + bv->bv_len > start_ofs) {
				/* skip any bytes before start_ofs */
				int remainder = max(start_ofs - pos, 0);
				/* map with irqs off; page may be highmem */
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
800
/*
 * bio_chain_clone - clone a chain of bios up to a certain length.
 *
 * Clones bios from *@old until @len bytes have been covered.  On
 * return, *@old points at the first un-consumed original bio and
 * *@next at the bio where the next clone pass should resume (which
 * is &bp->bio2 when a split was needed).  If @len falls inside a
 * bio, that bio is split and *@bp is set to the resulting pair,
 * which the caller must eventually release.  Any pair left in *@bp
 * from a previous call is released on entry.
 *
 * Returns the head of the new chain, or NULL on allocation/split
 * failure (already-cloned bios are put via bio_chain_put()).
 */
static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
				   struct bio_pair **bp,
				   int len, gfp_t gfpmask)
{
	struct bio *old_chain = *old;
	struct bio *new_chain = NULL;
	struct bio *tail;
	int total = 0;

	/* release a leftover pair from the previous call, if any */
	if (*bp) {
		bio_pair_release(*bp);
		*bp = NULL;
	}

	while (old_chain && (total < len)) {
		struct bio *tmp;

		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
		if (!tmp)
			goto err_out;
		gfpmask &= ~__GFP_WAIT;	/* can't wait after the first */

		if (total + old_chain->bi_size > len) {
			struct bio_pair *bp;

			/*
			 * this split can only happen with a single paged bio,
			 * split_bio will BUG_ON if this is not the case
			 */
			dout("bio_chain_clone split! total=%d remaining=%d"
			     "bi_size=%u\n",
			     total, len - total, old_chain->bi_size);

			/* split the bio. We'll release it either in the next
			   call, or it will have to be released outside */
			bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
			if (!bp)
				goto err_out;

			__bio_clone(tmp, &bp->bio1);

			*next = &bp->bio2;
		} else {
			__bio_clone(tmp, old_chain);
			*next = old_chain->bi_next;
		}

		tmp->bi_bdev = NULL;
		tmp->bi_next = NULL;
		/* append the clone to the tail of the new chain */
		if (new_chain)
			tail->bi_next = tmp;
		else
			new_chain = tmp;
		tail = tmp;
		old_chain = old_chain->bi_next;

		total += tmp->bi_size;
	}

	/* caller guarantees the chain covers at least len bytes */
	rbd_assert(total == len);

	*old = old_chain;

	return new_chain;

err_out:
	dout("bio_chain_clone with err\n");
	bio_chain_put(new_chain);
	return NULL;
}
875
876/*
877 * helpers for osd request op vectors.
878 */
Alex Elder57cfc102012-06-26 12:57:03 -0700879static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
880 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700881{
Alex Elder57cfc102012-06-26 12:57:03 -0700882 struct ceph_osd_req_op *ops;
883
884 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
885 if (!ops)
886 return NULL;
887
888 ops[0].op = opcode;
889
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700890 /*
891 * op extent offset and length will be set later on
892 * in calc_raw_layout()
893 */
Alex Elder57cfc102012-06-26 12:57:03 -0700894 ops[0].payload_len = payload_len;
895
896 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700897}
898
/*
 * Free an op vector allocated by rbd_create_rw_ops().
 * kfree(NULL) is a no-op, so a NULL @ops is harmless.
 */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
903
/*
 * Record the completion of the sub-request at slot @index of @coll
 * and complete, in order, the portion of block request @rq covered
 * by every finished-and-contiguous sub-request so far.
 *
 * With no collection (single-segment request) the whole range is
 * completed directly.  Completion state is protected by the
 * request queue's lock.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	if (!coll) {
		/* no collection: complete the range immediately */
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	/* find the run of consecutively-done slots after num_done */
	max = min = coll->num_done;
	while (max < coll->total && coll->status[max].done)
		max++;

	/* complete that run in order; drop one kref per slot */
	for (i = min; i<max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
941
942static void rbd_coll_end_req(struct rbd_request *req,
943 int ret, u64 len)
944{
945 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
946}
947
/*
 * Send ceph osd request.
 *
 * Builds and submits an OSD request for @len bytes of @object_name
 * starting at @ofs.  Data is carried either by @bio or by the
 * @pages vector.  If @rbd_cb is NULL the call is synchronous: we
 * wait for the reply, optionally report the reassert version via
 * @ver, and drop the request.  Otherwise @rbd_cb runs on completion
 * and owns the cleanup.  If @linger_req is non-NULL the request is
 * registered to linger (re-sent across OSD map changes) and
 * returned through it.
 *
 * On failure the collection slot (@coll/@coll_index), if any, is
 * completed with the error.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *ops,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *req,
					 struct ceph_msg *msg),
			  struct ceph_osd_request **linger_req,
			  u64 *ver)
{
	struct ceph_osd_request *req;
	struct ceph_file_layout *layout;
	int ret;
	u64 bno;
	struct timespec mtime = CURRENT_TIME;
	struct rbd_request *req_data;
	struct ceph_osd_request_head *reqhead;
	struct ceph_osd_client *osdc;

	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
	if (!req_data) {
		/* report the failure to the collection before bailing */
		if (coll)
			rbd_coll_end_req_index(rq, coll, coll_index,
					       -ENOMEM, len);
		return -ENOMEM;
	}

	if (coll) {
		req_data->coll = coll;
		req_data->coll_index = coll_index;
	}

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
	     (unsigned long long) ofs, (unsigned long long) len);

	osdc = &rbd_dev->rbd_client->client->osdc;
	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
				      false, GFP_NOIO, pages, bio);
	if (!req) {
		ret = -ENOMEM;
		goto done_pages;
	}

	req->r_callback = rbd_cb;

	req_data->rq = rq;
	req_data->bio = bio;
	req_data->pages = pages;
	req_data->len = len;

	req->r_priv = req_data;

	/* the snapshot id goes in the request header, not the op */
	reqhead = req->r_request->front.iov_base;
	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);

	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
	req->r_oid_len = strlen(req->r_oid);

	/* single-object layout: one stripe of one full-size object */
	layout = &req->r_file_layout;
	memset(layout, 0, sizeof(*layout));
	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_stripe_count = cpu_to_le32(1);
	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
	ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
			     req, ops);

	ceph_osdc_build_request(req, ofs, &len,
				ops,
				snapc,
				&mtime,
				req->r_oid, req->r_oid_len);

	if (linger_req) {
		ceph_osdc_set_request_linger(osdc, req);
		*linger_req = req;
	}

	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* synchronous: wait here; async cleanup happens in rbd_cb */
		ret = ceph_osdc_wait_request(osdc, req);
		if (ver)
			*ver = le64_to_cpu(req->r_reassert_version.version);
		dout("reassert_ver=%llu\n",
		     (unsigned long long)
		     le64_to_cpu(req->r_reassert_version.version));
		ceph_osdc_put_request(req);
	}
	return ret;

done_err:
	bio_chain_put(req_data->bio);
	ceph_osdc_put_request(req);
done_pages:
	/* completes the coll slot (if any) with the error in ret */
	rbd_coll_end_req(req_data, ret, len);
	kfree(req_data);
	return ret;
}
1059
/*
 * Ceph osd op callback: completion handler for asynchronous
 * requests submitted by rbd_do_request().
 *
 * Parses the OSD reply, papering over two read cases: a read of a
 * nonexistent object (-ENOENT) is treated as a successful read of
 * zeros, and a short read has its unread tail zero-filled.  Then
 * completes the collection slot and releases the bio chain, the
 * OSD request, and the per-request bookkeeping.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
	     (unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* hole in the image: reading nothing reads zeros */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero the remainder and claim full length */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1099
/*
 * Minimal completion callback for fire-and-forget requests (e.g.
 * notify acks): just drop the request reference.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1104
/*
 * Do a synchronous ceph osd operation.
 *
 * Allocates a temporary page vector to carry the data, submits the
 * request via rbd_do_request() with no callback (so it waits for
 * completion), and for reads copies up to @inbound_size result
 * bytes into @inbound.  Despite the "inbound" naming, the page
 * vector is also what carries outbound payloads placed in @ops.
 *
 * Returns the number of bytes transferred, or a negative errno.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int flags,
			   struct ceph_osd_req_op *ops,
			   const char *object_name,
			   u64 ofs, u64 inbound_size,
			   char *inbound,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(ops != NULL);

	num_pages = calc_pages_for(ofs, inbound_size);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* NULL callback makes rbd_do_request() wait for the reply */
	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			     object_name, ofs, inbound_size, NULL,
			     pages, num_pages,
			     flags,
			     ops,
			     NULL, 0,
			     NULL,
			     linger_req, ver);
	if (ret < 0)
		goto done;

	/* for reads, ret is the byte count actually returned */
	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1148
/*
 * Do an asynchronous ceph osd operation.
 *
 * Maps the image-relative range (@ofs, @len) onto a single object:
 * computes the object name and the offset/length within it, builds
 * a one-op request vector, and submits it with rbd_req_cb() as the
 * completion callback.  The caller must have already split the I/O
 * at object boundaries, so @len never spans objects here.
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 snapid,
		     int opcode, int flags,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;

	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	/* only writes carry a data payload toward the OSD */
	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1201
/*
 * Request async osd write.
 *
 * Thin wrapper around rbd_do_op(): a durable (ONDISK) write against
 * the head (CEPH_NOSNAP) with the given snapshot context.
 */
static int rbd_req_write(struct request *rq,
			 struct rbd_device *rbd_dev,
			 struct ceph_snap_context *snapc,
			 u64 ofs, u64 len,
			 struct bio *bio,
			 struct rbd_req_coll *coll,
			 int coll_index)
{
	return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
			 CEPH_OSD_OP_WRITE,
			 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			 ofs, len, bio, coll, coll_index);
}
1218
/*
 * Request async osd read.
 *
 * Thin wrapper around rbd_do_op(): a read from the given @snapid
 * (no snapshot context is needed for reads).
 */
static int rbd_req_read(struct request *rq,
			struct rbd_device *rbd_dev,
			u64 snapid,
			u64 ofs, u64 len,
			struct bio *bio,
			struct rbd_req_coll *coll,
			int coll_index)
{
	return rbd_do_op(rq, rbd_dev, NULL,
			 snapid,
			 CEPH_OSD_OP_READ,
			 CEPH_OSD_FLAG_READ,
			 ofs, len, bio, coll, coll_index);
}
1236
/*
 * Request sync osd read.
 *
 * Synchronously read @len bytes of @object_name at @ofs (as of
 * @snapid) into @buf.  Optionally reports the object version via
 * @ver.  Returns bytes read or a negative errno.
 */
static int rbd_req_sync_read(struct rbd_device *rbd_dev,
			     u64 snapid,
			     const char *object_name,
			     u64 ofs, u64 len,
			     char *buf,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
	if (!ops)
		return -ENOMEM;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      snapid,
			      CEPH_OSD_FLAG_READ,
			      ops, object_name, ofs, len, buf, NULL, ver);
	rbd_destroy_ops(ops);

	return ret;
}
1262
/*
 * Acknowledge a watch notification on the header object, so the
 * notifier's ceph_osdc_notify() can complete.  Fire-and-forget:
 * completion is handled by rbd_simple_req_cb().
 *
 * NOTE(review): watch.ver is byte-swapped with cpu_to_le64() but
 * watch.cookie is assigned notify_id directly — verify against the
 * osd_client's expected byte order; this looks inconsistent on
 * big-endian hosts.
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = cpu_to_le64(ver);
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			     rbd_dev->header_name, 0, 0, NULL,
			     NULL, 0,
			     CEPH_OSD_FLAG_READ,
			     ops,
			     NULL, 0,
			     rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1292
/*
 * Watch event callback, invoked by the osd client when the header
 * object we are watching changes (e.g. snapshot created, image
 * resized).  Refreshes the in-memory header, then acknowledges the
 * notification so the notifier can proceed.  @data is the
 * rbd_device registered in rbd_req_sync_watch().
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	u64 hver;
	int rc;

	if (!rbd_dev)
		return;

	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
	     rbd_dev->header_name, (unsigned long long) notify_id,
	     (unsigned int) opcode);
	rc = rbd_refresh_header(rbd_dev, &hver);
	/* refresh failure is logged but the notification is still acked */
	if (rc)
		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
			   " update snaps: %d\n", rbd_dev->major, rc);

	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
}
1312
/*
 * Request sync osd watch.
 *
 * Registers a watch on the image's header object: creates an osd
 * event whose callback is rbd_watch_cb(), then sends a lingering
 * WATCH op (kept in rbd_dev->watch_request so it survives OSD map
 * changes).  On failure the event is torn down again.
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 = establish the watch */

	/* lingering request: re-sent automatically on osdmap changes */
	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1356
/*
 * Request sync osd unwatch.
 *
 * Tears down the watch established by rbd_req_sync_watch(): sends a
 * WATCH op with flag 0 (unwatch) using the same event cookie, then
 * cancels the osd event unconditionally, even if the op failed.
 */
static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 0;	/* 0 = remove the watch */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL, NULL, NULL);


	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	return ret;
}
1386
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001387/*
Alex Elder3cb4a682012-06-26 12:57:03 -07001388 * Synchronous osd object method call
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001389 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001390static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001391 const char *object_name,
1392 const char *class_name,
1393 const char *method_name,
Alex Elder3cb4a682012-06-26 12:57:03 -07001394 const char *outbound,
1395 size_t outbound_size,
Alex Elderf8d4de62012-07-03 16:01:19 -05001396 char *inbound,
1397 size_t inbound_size,
Alex Elder3cb4a682012-06-26 12:57:03 -07001398 int flags,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001399 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001400{
1401 struct ceph_osd_req_op *ops;
Alex Elderaded07e2012-07-03 16:01:18 -05001402 int class_name_len = strlen(class_name);
1403 int method_name_len = strlen(method_name);
Alex Elder3cb4a682012-06-26 12:57:03 -07001404 int payload_size;
Alex Elder57cfc102012-06-26 12:57:03 -07001405 int ret;
1406
Alex Elder3cb4a682012-06-26 12:57:03 -07001407 /*
1408 * Any input parameters required by the method we're calling
1409 * will be sent along with the class and method names as
1410 * part of the message payload. That data and its size are
1411 * supplied via the indata and indata_len fields (named from
1412 * the perspective of the server side) in the OSD request
1413 * operation.
1414 */
1415 payload_size = class_name_len + method_name_len + outbound_size;
1416 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
Alex Elder57cfc102012-06-26 12:57:03 -07001417 if (!ops)
1418 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001419
Alex Elderaded07e2012-07-03 16:01:18 -05001420 ops[0].cls.class_name = class_name;
1421 ops[0].cls.class_len = (__u8) class_name_len;
1422 ops[0].cls.method_name = method_name;
1423 ops[0].cls.method_len = (__u8) method_name_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001424 ops[0].cls.argc = 0;
Alex Elder3cb4a682012-06-26 12:57:03 -07001425 ops[0].cls.indata = outbound;
1426 ops[0].cls.indata_len = outbound_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001427
Alex Elder0ce1a792012-07-03 16:01:18 -05001428 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001429 CEPH_NOSNAP,
Alex Elder3cb4a682012-06-26 12:57:03 -07001430 flags, ops,
Alex Elderf8d4de62012-07-03 16:01:19 -05001431 object_name, 0, inbound_size, inbound,
1432 NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001433
1434 rbd_destroy_ops(ops);
1435
1436 dout("cls_exec returned %d\n", ret);
1437 return ret;
1438}
1439
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001440static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1441{
1442 struct rbd_req_coll *coll =
1443 kzalloc(sizeof(struct rbd_req_coll) +
1444 sizeof(struct rbd_req_status) * num_reqs,
1445 GFP_ATOMIC);
1446
1447 if (!coll)
1448 return NULL;
1449 coll->total = num_reqs;
1450 kref_init(&coll->kref);
1451 return coll;
1452}
1453
/*
 * block device queue callback.
 *
 * Entered with q->queue_lock held (standard request_fn contract,
 * visible from the unlock/lock pairing below).  For each fetched
 * request: validate it, take a reference on the current snapshot
 * context, split the I/O at object boundaries, and submit one async
 * OSD read or write per segment, tracked by an rbd_req_coll.  The
 * queue lock is dropped while doing blocking work and re-taken
 * before completing requests or fetching the next one.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;
	struct bio_pair *bp = NULL;

	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		struct bio *rq_bio, *next_bio = NULL;
		bool do_write;
		unsigned int size;
		u64 op_size = 0;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		rq_bio = rq->bio;
		if (do_write && rbd_dev->mapping.read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		/* drop the queue lock: the work below can block */
		spin_unlock_irq(q->queue_lock);

		down_read(&rbd_dev->header_rwsem);

		/* a mapped snapshot may have been deleted underneath us */
		if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
		    !rbd_dev->mapping.snap_exists) {
			up_read(&rbd_dev->header_rwsem);
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		/* pin the snap context for the lifetime of this request */
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		if (num_segs <= 0) {
			/* 0 never happens here (size != 0): num_segs is an errno */
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, num_segs);
			ceph_put_snap_context(snapc);
			continue;
		}
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		do {
			/* a bio clone to be passed down to OSD req */
			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
			op_size = rbd_segment_length(rbd_dev, ofs, size);
			/* one coll ref per segment; put in rbd_coll_end_req_index */
			kref_get(&coll->kref);
			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
					      op_size, GFP_ATOMIC);
			if (!bio) {
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, op_size);
				goto next_seg;
			}


			/* init OSD command: write or read */
			if (do_write)
				rbd_req_write(rq, rbd_dev,
					      snapc,
					      ofs,
					      op_size, bio,
					      coll, cur_seg);
			else
				rbd_req_read(rq, rbd_dev,
					     rbd_dev->mapping.snap_id,
					     ofs,
					     op_size, bio,
					     coll, cur_seg);

next_seg:
			size -= op_size;
			ofs += op_size;

			cur_seg++;
			rq_bio = next_bio;
		} while (size > 0);
		/* drop the allocation's initial reference */
		kref_put(&coll->kref, rbd_coll_release);

		if (bp)
			bio_pair_release(bp);
		/* re-take the queue lock before the next blk_fetch_request() */
		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1573
/*
 * Block-queue merge_bvec callback: limit a bio so it never spans more
 * than one backing OSD object.  The one exception is a single-page bio
 * that already straddles an object boundary, which is split later in
 * bio_chain_clone.
 *
 * Returns the number of bytes of @bvec that may be added to the bio
 * described by @bmd (0 refuses the merge entirely).
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	unsigned int chunk_sectors;	/* object size, in sectors */
	sector_t sector;		/* absolute start sector of the bio */
	unsigned int bio_sectors;	/* sectors already in the bio */
	int max;

	chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
	bio_sectors = bmd->bi_size >> SECTOR_SHIFT;

	/* Bytes remaining in the object the bio currently ends in */
	max = (chunk_sectors - ((sector & (chunk_sectors - 1))
				 + bio_sectors)) << SECTOR_SHIFT;
	if (max < 0)
		max = 0; /* bio_add cannot handle a negative return */
	/* An empty bio may take one bvec even across a boundary */
	if (max <= bvec->bv_len && bio_sectors == 0)
		return bvec->bv_len;
	return max;
}
1600
/*
 * Tear down the gendisk (and its request queue) for an rbd device.
 * Safe to call when no disk was ever allocated.  Order matters:
 * unregister from the block layer before destroying the queue,
 * and drop the gendisk reference last.
 */
static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk = rbd_dev->disk;

	if (!disk)
		return;

	/* Only unregister if add_disk() made it visible */
	if (disk->flags & GENHD_FL_UP)
		del_gendisk(disk);
	if (disk->queue)
		blk_cleanup_queue(disk->queue);
	put_disk(disk);
}
1614
1615/*
Alex Elder4156d992012-08-02 11:29:46 -05001616 * Read the complete header for the given rbd device.
1617 *
1618 * Returns a pointer to a dynamically-allocated buffer containing
1619 * the complete and validated header. Caller can pass the address
1620 * of a variable that will be filled in with the version of the
1621 * header object at the time it was read.
1622 *
1623 * Returns a pointer-coded errno if a failure occurs.
1624 */
1625static struct rbd_image_header_ondisk *
1626rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
1627{
1628 struct rbd_image_header_ondisk *ondisk = NULL;
1629 u32 snap_count = 0;
1630 u64 names_size = 0;
1631 u32 want_count;
1632 int ret;
1633
1634 /*
1635 * The complete header will include an array of its 64-bit
1636 * snapshot ids, followed by the names of those snapshots as
1637 * a contiguous block of NUL-terminated strings. Note that
1638 * the number of snapshots could change by the time we read
1639 * it in, in which case we re-read it.
1640 */
1641 do {
1642 size_t size;
1643
1644 kfree(ondisk);
1645
1646 size = sizeof (*ondisk);
1647 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1648 size += names_size;
1649 ondisk = kmalloc(size, GFP_KERNEL);
1650 if (!ondisk)
1651 return ERR_PTR(-ENOMEM);
1652
1653 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
1654 rbd_dev->header_name,
1655 0, size,
1656 (char *) ondisk, version);
1657
1658 if (ret < 0)
1659 goto out_err;
1660 if (WARN_ON((size_t) ret < size)) {
1661 ret = -ENXIO;
1662 pr_warning("short header read for image %s"
1663 " (want %zd got %d)\n",
1664 rbd_dev->image_name, size, ret);
1665 goto out_err;
1666 }
1667 if (!rbd_dev_ondisk_valid(ondisk)) {
1668 ret = -ENXIO;
1669 pr_warning("invalid header for image %s\n",
1670 rbd_dev->image_name);
1671 goto out_err;
1672 }
1673
1674 names_size = le64_to_cpu(ondisk->snap_names_len);
1675 want_count = snap_count;
1676 snap_count = le32_to_cpu(ondisk->snap_count);
1677 } while (snap_count != want_count);
1678
1679 return ondisk;
1680
1681out_err:
1682 kfree(ondisk);
1683
1684 return ERR_PTR(ret);
1685}
1686
/*
 * (Re)read the on-disk (format 1) header and convert it into the
 * in-memory representation in *header.  On success header->obj_version
 * is set to the version of the header object that was read.
 * Returns 0 on success, negative errno on failure.
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
			   struct rbd_image_header *header)
{
	struct rbd_image_header_ondisk *ondisk;
	u64 ver = 0;
	int ret;

	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
	if (IS_ERR(ondisk))
		return PTR_ERR(ondisk);
	ret = rbd_header_from_disk(header, ondisk);
	if (ret >= 0)
		header->obj_version = ver;
	kfree(ondisk);	/* raw buffer no longer needed after conversion */

	return ret;
}
1707
/*
 * Remove (and unregister) every snapshot device on the rbd device's
 * snapshot list.  Uses the _safe iterator because each entry is
 * unlinked (and may be freed) as we go.
 */
static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
{
	struct rbd_snap *snap;
	struct rbd_snap *next;

	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
		__rbd_remove_snap_dev(snap);
}
1716
/*
 * Re-read the image header and fold the result into rbd_dev->header,
 * updating the mapped size and snapshot list to match.  Caller must
 * hold ctl_mutex (see rbd_refresh_header()); the header_rwsem is taken
 * here for the duration of the in-memory update.
 * If @hver is non-NULL it receives the new header object version.
 */
static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	struct rbd_image_header h;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	down_write(&rbd_dev->header_rwsem);

	/* resized? */
	if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) {
		/* Only the base image can change size under us */
		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;

		if (size != (sector_t) rbd_dev->mapping.size) {
			dout("setting size to %llu sectors",
				(unsigned long long) size);
			rbd_dev->mapping.size = (u64) size;
			set_capacity(rbd_dev->disk, size);
		}
	}

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	if (hver)
		*hver = h.obj_version;
	rbd_dev->header.obj_version = h.obj_version;
	rbd_dev->header.image_size = h.image_size;
	/* Take ownership of the freshly-read snapshot data */
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	/* Reconcile the snapshot device list with the new context */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (!ret)
		ret = rbd_dev_snaps_register(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
1768
/*
 * Locked wrapper around __rbd_refresh_header(): serializes header
 * refreshes via ctl_mutex.  See __rbd_refresh_header() for semantics
 * of @hver and the return value.
 */
static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	ret = __rbd_refresh_header(rbd_dev, hver);
	mutex_unlock(&ctl_mutex);

	return ret;
}
1779
/*
 * Allocate and configure the gendisk and request queue for an rbd
 * device, sizing I/O limits to the backing object size.  On success
 * rbd_dev->disk is set and 0 is returned; on failure -ENOMEM.
 * The disk is not yet added (no add_disk() here).
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* keep bios within a single object (see rbd_merge_bvec) */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
1828
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001829/*
1830 sysfs
1831*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001832
/* Map a sysfs struct device back to its containing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1837
/*
 * sysfs "size" attribute: mapped image size in bytes.  The capacity
 * read is done under header_rwsem so it is consistent with a
 * concurrent header refresh.
 */
static ssize_t rbd_size_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	sector_t size;

	down_read(&rbd_dev->header_rwsem);
	size = get_capacity(rbd_dev->disk);
	up_read(&rbd_dev->header_rwsem);

	/* capacity is in 512-byte sectors; report bytes */
	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
}
1850
Alex Elder34b13182012-07-13 20:35:12 -05001851/*
1852 * Note this shows the features for whatever's mapped, which is not
1853 * necessarily the base image.
1854 */
1855static ssize_t rbd_features_show(struct device *dev,
1856 struct device_attribute *attr, char *buf)
1857{
1858 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1859
1860 return sprintf(buf, "0x%016llx\n",
1861 (unsigned long long) rbd_dev->mapping.features);
1862}
1863
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001864static ssize_t rbd_major_show(struct device *dev,
1865 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001866{
Alex Elder593a9e72012-02-07 12:03:37 -06001867 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001868
1869 return sprintf(buf, "%d\n", rbd_dev->major);
1870}
1871
1872static ssize_t rbd_client_id_show(struct device *dev,
1873 struct device_attribute *attr, char *buf)
1874{
Alex Elder593a9e72012-02-07 12:03:37 -06001875 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001876
Alex Elder1dbb4392012-01-24 10:08:37 -06001877 return sprintf(buf, "client%lld\n",
1878 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001879}
1880
1881static ssize_t rbd_pool_show(struct device *dev,
1882 struct device_attribute *attr, char *buf)
1883{
Alex Elder593a9e72012-02-07 12:03:37 -06001884 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001885
1886 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1887}
1888
Alex Elder9bb2f332012-07-12 10:46:35 -05001889static ssize_t rbd_pool_id_show(struct device *dev,
1890 struct device_attribute *attr, char *buf)
1891{
1892 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1893
1894 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1895}
1896
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001897static ssize_t rbd_name_show(struct device *dev,
1898 struct device_attribute *attr, char *buf)
1899{
Alex Elder593a9e72012-02-07 12:03:37 -06001900 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001901
Alex Elder0bed54d2012-07-03 16:01:18 -05001902 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001903}
1904
Alex Elder589d30e2012-07-10 20:30:11 -05001905static ssize_t rbd_image_id_show(struct device *dev,
1906 struct device_attribute *attr, char *buf)
1907{
1908 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1909
1910 return sprintf(buf, "%s\n", rbd_dev->image_id);
1911}
1912
Alex Elder34b13182012-07-13 20:35:12 -05001913/*
1914 * Shows the name of the currently-mapped snapshot (or
1915 * RBD_SNAP_HEAD_NAME for the base image).
1916 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001917static ssize_t rbd_snap_show(struct device *dev,
1918 struct device_attribute *attr,
1919 char *buf)
1920{
Alex Elder593a9e72012-02-07 12:03:37 -06001921 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001922
Alex Elderf84344f2012-08-31 17:29:51 -05001923 return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001924}
1925
/*
 * sysfs "refresh" store method: any write triggers a re-read of the
 * image header.  Returns the number of bytes consumed (@size) on
 * success, or the negative errno from the refresh.
 */
static ssize_t rbd_image_refresh(struct device *dev,
				 struct device_attribute *attr,
				 const char *buf,
				 size_t size)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int ret;

	ret = rbd_refresh_header(rbd_dev, NULL);

	return ret < 0 ? ret : size;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001938
/* sysfs attributes for an rbd device; all read-only except "refresh" */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* Empty release: rbd_device lifetime is managed elsewhere */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name = "rbd",
	.groups = rbd_attr_groups,
	.release = rbd_sysfs_dev_release,
};
1982
1983
1984/*
1985 sysfs - snapshots
1986*/
1987
1988static ssize_t rbd_snap_size_show(struct device *dev,
1989 struct device_attribute *attr,
1990 char *buf)
1991{
1992 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1993
Josh Durgin35915382011-12-05 18:25:13 -08001994 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001995}
1996
1997static ssize_t rbd_snap_id_show(struct device *dev,
1998 struct device_attribute *attr,
1999 char *buf)
2000{
2001 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2002
Josh Durgin35915382011-12-05 18:25:13 -08002003 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002004}
2005
Alex Elder34b13182012-07-13 20:35:12 -05002006static ssize_t rbd_snap_features_show(struct device *dev,
2007 struct device_attribute *attr,
2008 char *buf)
2009{
2010 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2011
2012 return sprintf(buf, "0x%016llx\n",
2013 (unsigned long long) snap->features);
2014}
2015
/* sysfs attributes exposed on each snapshot device (all read-only) */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/*
 * Device-model release callback: frees the rbd_snap (and its name)
 * when the last reference to its embedded struct device is dropped.
 */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups = rbd_snap_attr_groups,
	.release = rbd_snap_dev_release,
};
2047
/*
 * Whether this snapshot's device has been registered with the device
 * model.  The device type is only assigned in rbd_register_snap_dev(),
 * so it serves as the registration marker; the assertion checks it
 * agrees with the device core's own notion of registration.
 */
static bool rbd_snap_registered(struct rbd_snap *snap)
{
	bool ret = snap->dev.type == &rbd_snap_device_type;
	bool reg = device_is_registered(&snap->dev);

	/* the two indicators must agree (XOR of the negation is false) */
	rbd_assert(!ret ^ reg);

	return ret;
}
2057
/*
 * Unlink a snapshot from the device's list and unregister its device
 * if it was registered.  Unregistering may drop the final reference,
 * in which case rbd_snap_dev_release() frees @snap.
 */
static void __rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	if (device_is_registered(&snap->dev))
		device_unregister(&snap->dev);
}
2064
/*
 * Register the snapshot's embedded device under @parent in sysfs as
 * "snap_<name>".  Returns the device_register() result.
 */
static int rbd_register_snap_dev(struct rbd_snap *snap,
				  struct device *parent)
{
	struct device *dev = &snap->dev;
	int ret;

	/* setting the type marks the snapshot as registered */
	dev->type = &rbd_snap_device_type;
	dev->parent = parent;
	dev->release = rbd_snap_dev_release;
	dev_set_name(dev, "snap_%s", snap->name);
	dout("%s: registering device for snapshot %s\n", __func__, snap->name);

	ret = device_register(dev);

	return ret;
}
2081
/*
 * Allocate and initialize an rbd_snap for the given snapshot.  The
 * name is duplicated, so @snap_name need not outlive the call.  Does
 * NOT register a device or add the snapshot to any list.
 * Returns the new snapshot, or a pointer-coded -ENOMEM.
 */
static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
					   const char *snap_name,
					   u64 snap_id, u64 snap_size,
					   u64 snap_features)
{
	struct rbd_snap *snap;
	int ret;

	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
	if (!snap)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	snap->name = kstrdup(snap_name, GFP_KERNEL);
	if (!snap->name)
		goto err;

	snap->id = snap_id;
	snap->size = snap_size;
	snap->features = snap_features;

	return snap;

err:
	kfree(snap->name);	/* kfree(NULL) is a no-op */
	kfree(snap);

	return ERR_PTR(ret);
}
2111
/*
 * Look up the name, size, and features of the @which'th snapshot in a
 * format 1 header.  The name returned points into the header's
 * snap_names block (not a copy); @which must be a valid index.
 */
static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
		u64 *snap_size, u64 *snap_features)
{
	char *snap_name;

	rbd_assert(which < rbd_dev->header.snapc->num_snaps);

	*snap_size = rbd_dev->header.snap_sizes[which];
	*snap_features = 0; /* No features for v1 */

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which--)
		snap_name += strlen(snap_name) + 1;

	return snap_name;
}
2130
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002131/*
Alex Elder9d475de2012-07-03 16:01:19 -05002132 * Get the size and object order for an image snapshot, or if
2133 * snap_id is CEPH_NOSNAP, gets this information for the base
2134 * image.
2135 */
/*
 * Issue the "get_size" class method against the header object and
 * decode the little-endian reply into *order and *snap_size.
 * Returns 0 on success, negative errno on failure.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	/* wire format of the get_size reply */
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2164
/* Fetch the base image's object order and size into the header. */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
2171
/*
 * Fetch the image's object prefix via the "get_object_prefix" class
 * method and store a freshly-allocated copy in
 * rbd_dev->header.object_prefix.  Returns 0 or a negative errno.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	/* reply is a ceph-encoded string; extract an allocated copy */
	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
2208
/*
 * Issue the "get_features" class method for @snap_id (or the base
 * image for CEPH_NOSNAP) and decode the feature mask into
 * *snap_features.  The incompat mask is only logged here.
 * Returns 0 or a negative errno.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	/* wire format of the get_features reply */
	struct {
		__le64 features;
		__le64 incompat;
	} features_buf = { 0 };
	int ret;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	*snap_features = le64_to_cpu(features_buf.features);

	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2236
/* Fetch the base image's feature mask into the header. */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}
2242
Alex Elder9d475de2012-07-03 16:01:19 -05002243/*
Alex Elder35938152012-08-02 11:29:46 -05002244 * Scan the rbd device's current snapshot list and compare it to the
2245 * newly-received snapshot context. Remove any existing snapshots
2246 * not present in the new snapshot context. Add a new snapshot for
2247 * any snaphots in the snapshot context not in the current list.
2248 * And verify there are no changes to snapshots we already know
2249 * about.
2250 *
2251 * Assumes the snapshots in the snapshot context are sorted by
2252 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2253 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002254 */
Alex Elder304f6802012-08-31 17:29:52 -05002255static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002256{
Alex Elder35938152012-08-02 11:29:46 -05002257 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2258 const u32 snap_count = snapc->num_snaps;
Alex Elder35938152012-08-02 11:29:46 -05002259 struct list_head *head = &rbd_dev->snaps;
2260 struct list_head *links = head->next;
2261 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002262
Alex Elder9fcbb802012-08-23 23:48:49 -05002263 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
Alex Elder35938152012-08-02 11:29:46 -05002264 while (index < snap_count || links != head) {
2265 u64 snap_id;
2266 struct rbd_snap *snap;
Alex Eldercd892122012-07-03 16:01:19 -05002267 char *snap_name;
2268 u64 snap_size = 0;
2269 u64 snap_features = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002270
Alex Elder35938152012-08-02 11:29:46 -05002271 snap_id = index < snap_count ? snapc->snaps[index]
2272 : CEPH_NOSNAP;
2273 snap = links != head ? list_entry(links, struct rbd_snap, node)
2274 : NULL;
Alex Elderaafb2302012-09-06 16:00:54 -05002275 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002276
Alex Elder35938152012-08-02 11:29:46 -05002277 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2278 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002279
Alex Elder35938152012-08-02 11:29:46 -05002280 /* Existing snapshot not in the new snap context */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002281
Alex Elderf84344f2012-08-31 17:29:51 -05002282 if (rbd_dev->mapping.snap_id == snap->id)
2283 rbd_dev->mapping.snap_exists = false;
Alex Elder35938152012-08-02 11:29:46 -05002284 __rbd_remove_snap_dev(snap);
Alex Elder9fcbb802012-08-23 23:48:49 -05002285 dout("%ssnap id %llu has been removed\n",
Alex Elderf84344f2012-08-31 17:29:51 -05002286 rbd_dev->mapping.snap_id == snap->id ?
2287 "mapped " : "",
Alex Elder9fcbb802012-08-23 23:48:49 -05002288 (unsigned long long) snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002289
Alex Elder35938152012-08-02 11:29:46 -05002290 /* Done with this list entry; advance */
2291
2292 links = next;
2293 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002294 }
Alex Elder35938152012-08-02 11:29:46 -05002295
Alex Eldercd892122012-07-03 16:01:19 -05002296 snap_name = rbd_dev_v1_snap_info(rbd_dev, index,
2297 &snap_size, &snap_features);
2298 if (IS_ERR(snap_name))
2299 return PTR_ERR(snap_name);
2300
Alex Elder9fcbb802012-08-23 23:48:49 -05002301 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2302 (unsigned long long) snap_id);
Alex Elder35938152012-08-02 11:29:46 -05002303 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2304 struct rbd_snap *new_snap;
2305
2306 /* We haven't seen this snapshot before */
2307
Alex Elderc8d18422012-07-10 20:30:11 -05002308 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
Alex Eldercd892122012-07-03 16:01:19 -05002309 snap_id, snap_size, snap_features);
Alex Elder9fcbb802012-08-23 23:48:49 -05002310 if (IS_ERR(new_snap)) {
2311 int err = PTR_ERR(new_snap);
2312
2313 dout(" failed to add dev, error %d\n", err);
2314
2315 return err;
2316 }
Alex Elder35938152012-08-02 11:29:46 -05002317
2318 /* New goes before existing, or at end of list */
2319
Alex Elder9fcbb802012-08-23 23:48:49 -05002320 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05002321 if (snap)
2322 list_add_tail(&new_snap->node, &snap->node);
2323 else
Alex Elder523f3252012-08-30 00:16:37 -05002324 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05002325 } else {
2326 /* Already have this one */
2327
Alex Elder9fcbb802012-08-23 23:48:49 -05002328 dout(" already present\n");
2329
Alex Eldercd892122012-07-03 16:01:19 -05002330 rbd_assert(snap->size == snap_size);
Alex Elderaafb2302012-09-06 16:00:54 -05002331 rbd_assert(!strcmp(snap->name, snap_name));
Alex Eldercd892122012-07-03 16:01:19 -05002332 rbd_assert(snap->features == snap_features);
Alex Elder35938152012-08-02 11:29:46 -05002333
2334 /* Done with this list entry; advance */
2335
2336 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002337 }
Alex Elder35938152012-08-02 11:29:46 -05002338
2339 /* Advance to the next entry in the snapshot context */
2340
2341 index++;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002342 }
Alex Elder9fcbb802012-08-23 23:48:49 -05002343 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002344
2345 return 0;
2346}
2347
Alex Elder304f6802012-08-31 17:29:52 -05002348/*
2349 * Scan the list of snapshots and register the devices for any that
2350 * have not already been registered.
2351 */
2352static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2353{
2354 struct rbd_snap *snap;
2355 int ret = 0;
2356
2357 dout("%s called\n", __func__);
Alex Elder86ff77b2012-08-31 17:29:53 -05002358 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2359 return -EIO;
Alex Elder304f6802012-08-31 17:29:52 -05002360
2361 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2362 if (!rbd_snap_registered(snap)) {
2363 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2364 if (ret < 0)
2365 break;
2366 }
2367 }
2368 dout("%s: returning %d\n", __func__, ret);
2369
2370 return ret;
2371}
2372
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002373static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2374{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002375 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05002376 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002377
2378 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002379
Alex Eldercd789ab2012-08-30 00:16:38 -05002380 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002381 dev->bus = &rbd_bus_type;
2382 dev->type = &rbd_device_type;
2383 dev->parent = &rbd_root_dev;
2384 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05002385 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002386 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002387
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002388 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05002389
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002390 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002391}
2392
/*
 * Remove the rbd device from sysfs.  Dropping the last reference
 * invokes rbd_dev_release(), which frees the rbd_dev itself, so
 * callers must not touch rbd_dev after this returns.
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2397
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002398static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2399{
2400 int ret, rc;
2401
2402 do {
Alex Elder0e6f3222012-07-25 09:32:40 -05002403 ret = rbd_req_sync_watch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002404 if (ret == -ERANGE) {
Alex Elder1fe5e992012-07-25 09:32:41 -05002405 rc = rbd_refresh_header(rbd_dev, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002406 if (rc < 0)
2407 return rc;
2408 }
2409 } while (ret == -ERANGE);
2410
2411 return ret;
2412}
2413
/*
 * Highest rbd device id currently handed out; rbd_dev_id_get()
 * allocates new ids from it and rbd_dev_id_put() lowers it when the
 * maximum id is released.
 */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06002415
2416/*
Alex Elder499afd52012-02-02 08:13:29 -06002417 * Get a unique rbd identifier for the given new rbd_dev, and add
2418 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06002419 */
Alex Eldere2839302012-08-29 17:11:06 -05002420static void rbd_dev_id_get(struct rbd_device *rbd_dev)
Alex Elderb7f23c32012-01-29 13:57:43 -06002421{
Alex Eldere2839302012-08-29 17:11:06 -05002422 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
Alex Elder499afd52012-02-02 08:13:29 -06002423
2424 spin_lock(&rbd_dev_list_lock);
2425 list_add_tail(&rbd_dev->node, &rbd_dev_list);
2426 spin_unlock(&rbd_dev_list_lock);
Alex Eldere2839302012-08-29 17:11:06 -05002427 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
2428 (unsigned long long) rbd_dev->dev_id);
Alex Elder1ddbe942012-01-29 13:57:44 -06002429}
Alex Elderb7f23c32012-01-29 13:57:43 -06002430
Alex Elder1ddbe942012-01-29 13:57:44 -06002431/*
Alex Elder499afd52012-02-02 08:13:29 -06002432 * Remove an rbd_dev from the global list, and record that its
2433 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002434 */
Alex Eldere2839302012-08-29 17:11:06 -05002435static void rbd_dev_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002436{
Alex Elderd184f6b2012-01-29 13:57:44 -06002437 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05002438 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002439 int max_id;
2440
Alex Elderaafb2302012-09-06 16:00:54 -05002441 rbd_assert(rbd_id > 0);
Alex Elder499afd52012-02-02 08:13:29 -06002442
Alex Eldere2839302012-08-29 17:11:06 -05002443 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2444 (unsigned long long) rbd_dev->dev_id);
Alex Elder499afd52012-02-02 08:13:29 -06002445 spin_lock(&rbd_dev_list_lock);
2446 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002447
2448 /*
2449 * If the id being "put" is not the current maximum, there
2450 * is nothing special we need to do.
2451 */
Alex Eldere2839302012-08-29 17:11:06 -05002452 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
Alex Elderd184f6b2012-01-29 13:57:44 -06002453 spin_unlock(&rbd_dev_list_lock);
2454 return;
2455 }
2456
2457 /*
2458 * We need to update the current maximum id. Search the
2459 * list to find out what it is. We're more likely to find
2460 * the maximum at the end, so search the list backward.
2461 */
2462 max_id = 0;
2463 list_for_each_prev(tmp, &rbd_dev_list) {
2464 struct rbd_device *rbd_dev;
2465
2466 rbd_dev = list_entry(tmp, struct rbd_device, node);
2467 if (rbd_id > max_id)
2468 max_id = rbd_id;
2469 }
Alex Elder499afd52012-02-02 08:13:29 -06002470 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002471
Alex Elder1ddbe942012-01-29 13:57:44 -06002472 /*
Alex Eldere2839302012-08-29 17:11:06 -05002473 * The max id could have been updated by rbd_dev_id_get(), in
Alex Elderd184f6b2012-01-29 13:57:44 -06002474 * which case it now accurately reflects the new maximum.
2475 * Be careful not to overwrite the maximum value in that
2476 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002477 */
Alex Eldere2839302012-08-29 17:11:06 -05002478 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2479 dout(" max dev id has been reset\n");
Alex Elderb7f23c32012-01-29 13:57:43 -06002480}
2481
/*
 * Advance *buf past any leading white space (the characters for
 * which isspace() is nonzero in the "C" and "POSIX" locales) and
 * return the length of the token (run of non-white-space
 * characters) found there.  *buf must be NUL-terminated; a return
 * value of 0 means only white space (or nothing) remained.
 */
static inline size_t next_token(const char **buf)
{
	static const char spaces[] = " \f\n\r\t\v";
	const char *p = *buf;

	p += strspn(p, spaces);		/* skip leading white space */
	*buf = p;

	return strcspn(p, spaces);	/* length of token at *buf */
}
2500
/*
 * Grab the next white-space-delimited token from *buf and, when the
 * supplied buffer is big enough, store a NUL-terminated copy of it
 * there.  *buf must be NUL-terminated on entry and is always
 * advanced past the token, even when the token was too large to
 * copy.
 *
 * Returns the token length, not counting the NUL: 0 when no token
 * was found, and a value >= token_size when the token did not fit
 * (in which case the token buffer is left untouched).
 */
static inline size_t copy_token(const char **buf, char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2530
2531/*
Alex Elderea3352f2012-07-09 21:04:23 -05002532 * Finds the next token in *buf, dynamically allocates a buffer big
2533 * enough to hold a copy of it, and copies the token into the new
2534 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2535 * that a duplicate buffer is created even for a zero-length token.
2536 *
2537 * Returns a pointer to the newly-allocated duplicate, or a null
2538 * pointer if memory for the duplicate was not available. If
2539 * the lenp argument is a non-null pointer, the length of the token
2540 * (not including the '\0') is returned in *lenp.
2541 *
2542 * If successful, the *buf pointer will be updated to point beyond
2543 * the end of the found token.
2544 *
2545 * Note: uses GFP_KERNEL for allocation.
2546 */
2547static inline char *dup_token(const char **buf, size_t *lenp)
2548{
2549 char *dup;
2550 size_t len;
2551
2552 len = next_token(buf);
2553 dup = kmalloc(len + 1, GFP_KERNEL);
2554 if (!dup)
2555 return NULL;
2556
2557 memcpy(dup, *buf, len);
2558 *(dup + len) = '\0';
2559 *buf += len;
2560
2561 if (lenp)
2562 *lenp = len;
2563
2564 return dup;
2565}
2566
/*
 * This fills in the pool_name, image_name, image_name_len, rbd_dev,
 * rbd_md_name, and name fields of the given rbd_dev, based on the
 * list of monitor addresses and other options provided via
 * /sys/bus/rbd/add.  Returns a pointer to a dynamically-allocated
 * copy of the snapshot name to map if successful, or a
 * pointer-coded error otherwise.
 *
 * Note: rbd_dev is assumed to have been initially zero-filled.
 */
static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
				const char *buf,
				const char **mon_addrs,
				size_t *mon_addrs_size,
				char *options,
				size_t options_size)
{
	size_t len;
	char *err_ptr = ERR_PTR(-EINVAL);	/* parse errors until noted */
	char *snap_name;

	/* The first four tokens are required */

	/* Monitor address list: returned by reference into buf itself */
	len = next_token(&buf);
	if (!len)
		return err_ptr;
	*mon_addrs_size = len + 1;
	*mon_addrs = buf;

	buf += len;

	/* Options token must fit entirely within the caller's buffer */
	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return err_ptr;

	/* From here on, failures are allocation failures */
	err_ptr = ERR_PTR(-ENOMEM);
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Snapshot name is optional */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	}
	/* Caller owns the returned copy of the snapshot name */
	snap_name = kmalloc(len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_err;
	memcpy(snap_name, buf, len);
	*(snap_name + len) = '\0';

dout(" SNAP_NAME is <%s>, len is %zd\n", snap_name, len);

	return snap_name;

out_err:
	/* Undo partial initialization so rbd_dev is zero-filled again */
	kfree(rbd_dev->image_name);
	rbd_dev->image_name = NULL;
	rbd_dev->image_name_len = 0;
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return err_ptr;
}
2636
/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len;
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* Invoke the server-side "get_id" class method on the id object */
	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	/* Decode the id string; on success the allocation is owned
	 * by rbd_dev->image_id */
	p = response;
	rbd_dev->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						&rbd_dev->image_id_len,
						GFP_NOIO);
	if (IS_ERR(rbd_dev->image_id)) {
		ret = PTR_ERR(rbd_dev->image_id);
		rbd_dev->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
2705
/*
 * Probe the image assuming it is in the original (format 1) rbd
 * layout: record an empty image id, derive the header object name
 * from the image name plus RBD_SUFFIX, and read the on-disk header.
 * On failure, fields initialized here are freed and reset to NULL.
 */
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->image_id)
		return -ENOMEM;
	rbd_dev->image_id_len = 0;

	/* Record the header object name for this rbd image. */

	size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;
	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->image_id);
	rbd_dev->image_id = NULL;

	return ret;
}
2748
/*
 * Probe a format 2 image: derive the header object name from the
 * image id (already determined by the caller), then fetch the
 * image's size/order, object prefix, and feature bits.
 *
 * NOTE: even when everything above succeeds this deliberately
 * returns -ENOTSUPP — format 2 support is not yet complete.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len;
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;
	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return -ENOTSUPP;
out_err:
	/* Free everything this function (and its helpers) populated */
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
2796
/*
 * Probe for the existence of the header object for the given rbd
 * device.  For format 2 images this includes determining the image
 * id.
 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret;

	/*
	 * If no image id object can be found the image is taken to
	 * be format 1; otherwise probe it as a format 2 image.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	ret = ret ? rbd_dev_v1_probe(rbd_dev) : rbd_dev_v2_probe(rbd_dev);
	if (ret)
		dout("probe failed, returning %d\n", ret);

	return ret;
}
2821
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002822static ssize_t rbd_add(struct bus_type *bus,
2823 const char *buf,
2824 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002825{
Alex Eldercb8627c2012-07-09 21:04:23 -05002826 char *options;
2827 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06002828 const char *mon_addrs = NULL;
2829 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06002830 struct ceph_osd_client *osdc;
2831 int rc = -ENOMEM;
Alex Elder3feeb8942012-08-31 17:29:52 -05002832 char *snap_name;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002833
2834 if (!try_module_get(THIS_MODULE))
2835 return -ENODEV;
2836
Alex Elder27cc2592012-02-02 08:13:30 -06002837 options = kmalloc(count, GFP_KERNEL);
2838 if (!options)
Alex Elder85ae8922012-07-26 23:37:14 -05002839 goto err_out_mem;
Alex Eldercb8627c2012-07-09 21:04:23 -05002840 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2841 if (!rbd_dev)
Alex Elder85ae8922012-07-26 23:37:14 -05002842 goto err_out_mem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002843
2844 /* static rbd_device initialization */
2845 spin_lock_init(&rbd_dev->lock);
2846 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002847 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08002848 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002849
Alex Eldera725f65e2012-02-02 08:13:30 -06002850 /* parse add command */
Alex Elder3feeb8942012-08-31 17:29:52 -05002851 snap_name = rbd_add_parse_args(rbd_dev, buf,
2852 &mon_addrs, &mon_addrs_size, options, count);
2853 if (IS_ERR(snap_name)) {
2854 rc = PTR_ERR(snap_name);
Alex Elder85ae8922012-07-26 23:37:14 -05002855 goto err_out_mem;
Alex Elder3feeb8942012-08-31 17:29:52 -05002856 }
Alex Eldera725f65e2012-02-02 08:13:30 -06002857
Alex Elderf8c38922012-08-10 13:12:07 -07002858 rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
2859 if (rc < 0)
Alex Elder85ae8922012-07-26 23:37:14 -05002860 goto err_out_args;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002861
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002862 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06002863 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002864 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2865 if (rc < 0)
2866 goto err_out_client;
Alex Elder9bb2f332012-07-12 10:46:35 -05002867 rbd_dev->pool_id = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002868
Alex Eldera30b71b2012-07-10 20:30:11 -05002869 rc = rbd_dev_probe(rbd_dev);
2870 if (rc < 0)
Alex Elder589d30e2012-07-10 20:30:11 -05002871 goto err_out_client;
Alex Eldera30b71b2012-07-10 20:30:11 -05002872 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder05fd6f62012-08-29 17:11:07 -05002873
2874 /* no need to lock here, as rbd_dev is not registered yet */
2875 rc = rbd_dev_snaps_update(rbd_dev);
2876 if (rc)
2877 goto err_out_header;
2878
2879 rc = rbd_dev_set_mapping(rbd_dev, snap_name);
2880 if (rc)
2881 goto err_out_header;
2882
Alex Elder85ae8922012-07-26 23:37:14 -05002883 /* generate unique id: find highest unique id, add one */
2884 rbd_dev_id_get(rbd_dev);
2885
2886 /* Fill in the device name, now that we have its id. */
2887 BUILD_BUG_ON(DEV_NAME_LEN
2888 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
2889 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
2890
2891 /* Get our block major device number. */
2892
Alex Elder27cc2592012-02-02 08:13:30 -06002893 rc = register_blkdev(0, rbd_dev->name);
2894 if (rc < 0)
Alex Elder85ae8922012-07-26 23:37:14 -05002895 goto err_out_id;
Alex Elder27cc2592012-02-02 08:13:30 -06002896 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002897
Alex Elder0f308a32012-08-29 17:11:07 -05002898 /* Set up the blkdev mapping. */
2899
2900 rc = rbd_init_disk(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002901 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002902 goto err_out_blkdev;
2903
Alex Elder0f308a32012-08-29 17:11:07 -05002904 rc = rbd_bus_add_dev(rbd_dev);
2905 if (rc)
2906 goto err_out_disk;
2907
Alex Elder32eec682012-02-08 16:11:14 -06002908 /*
2909 * At this point cleanup in the event of an error is the job
2910 * of the sysfs code (initiated by rbd_bus_del_dev()).
Alex Elder32eec682012-02-08 16:11:14 -06002911 */
Alex Elder2ac4e752012-07-10 20:30:10 -05002912
Alex Elder4bb1f1e2012-08-23 23:48:49 -05002913 down_write(&rbd_dev->header_rwsem);
Alex Elder5ed16172012-08-29 17:11:07 -05002914 rc = rbd_dev_snaps_register(rbd_dev);
Alex Elder4bb1f1e2012-08-23 23:48:49 -05002915 up_write(&rbd_dev->header_rwsem);
Alex Elder2ac4e752012-07-10 20:30:10 -05002916 if (rc)
2917 goto err_out_bus;
2918
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002919 rc = rbd_init_watch_dev(rbd_dev);
2920 if (rc)
2921 goto err_out_bus;
2922
Alex Elder3ee40012012-08-29 17:11:07 -05002923 /* Everything's ready. Announce the disk to the world. */
2924
2925 add_disk(rbd_dev->disk);
2926
2927 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
2928 (unsigned long long) rbd_dev->mapping.size);
2929
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002930 return count;
2931
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002932err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002933 /* this will also clean up rest of rbd_dev stuff */
2934
2935 rbd_bus_del_dev(rbd_dev);
2936 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002937 return rc;
2938
Alex Elder0f308a32012-08-29 17:11:07 -05002939err_out_disk:
2940 rbd_free_disk(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002941err_out_blkdev:
2942 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder85ae8922012-07-26 23:37:14 -05002943err_out_id:
2944 rbd_dev_id_put(rbd_dev);
Alex Elder05fd6f62012-08-29 17:11:07 -05002945err_out_header:
2946 rbd_header_free(&rbd_dev->header);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002947err_out_client:
Alex Elder3fcf2582012-07-03 16:01:19 -05002948 kfree(rbd_dev->header_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002949 rbd_put_client(rbd_dev);
Alex Elder589d30e2012-07-10 20:30:11 -05002950 kfree(rbd_dev->image_id);
Alex Elder85ae8922012-07-26 23:37:14 -05002951err_out_args:
2952 kfree(rbd_dev->mapping.snap_name);
2953 kfree(rbd_dev->image_name);
2954 kfree(rbd_dev->pool_name);
2955err_out_mem:
Alex Elder27cc2592012-02-02 08:13:30 -06002956 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05002957 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06002958
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002959 dout("Error adding device %s\n", buf);
2960 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06002961
2962 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002963}
2964
Alex Elderde71a292012-07-03 16:01:19 -05002965static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002966{
2967 struct list_head *tmp;
2968 struct rbd_device *rbd_dev;
2969
Alex Eldere124a82f2012-01-29 13:57:44 -06002970 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002971 list_for_each(tmp, &rbd_dev_list) {
2972 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05002973 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a82f2012-01-29 13:57:44 -06002974 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002975 return rbd_dev;
Alex Eldere124a82f2012-01-29 13:57:44 -06002976 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002977 }
Alex Eldere124a82f2012-01-29 13:57:44 -06002978 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002979 return NULL;
2980}
2981
/*
 * Release callback for the rbd device (installed by
 * rbd_bus_add_dev()); runs when the last reference to
 * rbd_dev->dev is dropped.  Undoes everything rbd_add() set up:
 * stops the header watch, drops the ceph client, tears down the
 * block device, releases the device id, and frees the rbd_dev.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Cancel the lingering watch request, if one is outstanding */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->mapping.snap_name);
	kfree(rbd_dev->image_id);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_dev_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
3016
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003017static ssize_t rbd_remove(struct bus_type *bus,
3018 const char *buf,
3019 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003020{
3021 struct rbd_device *rbd_dev = NULL;
3022 int target_id, rc;
3023 unsigned long ul;
3024 int ret = count;
3025
3026 rc = strict_strtoul(buf, 10, &ul);
3027 if (rc)
3028 return rc;
3029
3030 /* convert to int; abort if we lost anything in the conversion */
3031 target_id = (int) ul;
3032 if (target_id != ul)
3033 return -EINVAL;
3034
3035 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3036
3037 rbd_dev = __rbd_get_dev(target_id);
3038 if (!rbd_dev) {
3039 ret = -ENOENT;
3040 goto done;
3041 }
3042
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003043 __rbd_remove_all_snaps(rbd_dev);
3044 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003045
3046done:
3047 mutex_unlock(&ctl_mutex);
Alex Elderaafb2302012-09-06 16:00:54 -05003048
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003049 return ret;
3050}
3051
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003052/*
3053 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003054 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003055 */
3056static int rbd_sysfs_init(void)
3057{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003058 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003059
Alex Elderfed4c142012-02-07 12:03:36 -06003060 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06003061 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003062 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003063
Alex Elderfed4c142012-02-07 12:03:36 -06003064 ret = bus_register(&rbd_bus_type);
3065 if (ret < 0)
3066 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003067
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003068 return ret;
3069}
3070
/*
 * Remove the /sys/bus/rbd control files: unregister the bus first,
 * then the root device (reverse of rbd_sysfs_init()).
 */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3076
3077int __init rbd_init(void)
3078{
3079 int rc;
3080
3081 rc = rbd_sysfs_init();
3082 if (rc)
3083 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06003084 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003085 return 0;
3086}
3087
/* Module exit: tear down the sysfs control files. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3092
/* Module entry/exit points and metadata */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");