blob: 366a3a1f2aac6c58e01d309c561bdf1e5ca606bf [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elderaafb230e2012-09-06 16:00:54 -050044#define RBD_DEBUG /* Activate rbd_assert() calls */
45
Alex Elder593a9e72012-02-07 12:03:37 -060046/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
Alex Elderdf111be2012-08-09 10:33:26 -070055/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
Alex Elderf0f8cef2012-01-29 13:57:44 -060059#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070061
#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

#define RBD_MAX_SNAP_NAME_LEN	32
#define RBD_MAX_OPT_LEN		1024

#define RBD_SNAP_HEAD_NAME	"-"	/* "snapshot" name meaning the image head */

#define RBD_IMAGE_ID_LEN_MAX	64

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

#define RBD_READ_ONLY_DEFAULT	false	/* mappings are read/write unless asked */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070081
Yehuda Sadeh602adf42010-08-12 16:11:25 -070082/*
83 * block device image metadata (in-memory version)
84 */
struct rbd_image_header {
	/* These four fields never change for a given rbd image */
	char *object_prefix;	/* kmalloc'd prefix for data object names */
	u64 features;		/* feature bits; always 0 for v1 images */
	__u8 obj_order;		/* log2 of the object (segment) size */
	__u8 crypt_type;	/* copied verbatim from the on-disk header */
	__u8 comp_type;		/* copied verbatim from the on-disk header */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* current image size, in bytes */
	struct ceph_snap_context *snapc;	/* snapshot context (refcounted) */
	char *snap_names;	/* buffer holding all snapshot names */
	u64 *snap_sizes;	/* image size as of each snapshot */

	u64 obj_version;	/* version of the on-disk header object */
};
101
/* User-settable options parsed when the device is mapped */
struct rbd_options {
	bool read_only;		/* map the device read-only? */
};
105
106/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600107 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700108 */
struct rbd_client {
	struct ceph_client	*client;	/* the underlying ceph client */
	struct kref		kref;		/* reference count */
	struct list_head	node;		/* entry in rbd_client_list */
};
114
115/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600116 * a request completion status
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700117 */
struct rbd_req_status {
	int done;		/* request completion flag */
	int rc;			/* completion result code */
	u64 bytes;		/* byte count associated with the request */
};
123
124/*
125 * a collection of requests
126 */
struct rbd_req_coll {
	int			total;		/* number of requests in the collection */
	int			num_done;	/* count of completed requests */
	struct kref		kref;		/* released via rbd_coll_release() */
	struct rbd_req_status	status[0];	/* flexible array: total entries */
};
133
Alex Elderf0f8cef2012-01-29 13:57:44 -0600134/*
135 * a single io request
136 */
struct rbd_request {
	struct request		*rq;		/* blk layer request */
	struct bio		*bio;		/* cloned bio */
	struct page		**pages;	/* list of used pages */
	u64			len;		/* length of this request */
	int			coll_index;	/* index into coll->status[] */
	struct rbd_req_coll	*coll;		/* collection this request belongs to */
};
145
/* In-core record of one rbd snapshot; also exposed through sysfs */
struct rbd_snap {
	struct device		dev;		/* sysfs device for this snapshot */
	const char		*name;		/* snapshot name */
	u64			size;		/* image size as of this snapshot */
	struct list_head	node;		/* entry in rbd_device->snaps */
	u64			id;		/* snapshot id */
	u64			features;	/* feature bits for this snapshot */
};
154
/* Describes what this device currently has mapped (image head or snapshot) */
struct rbd_mapping {
	char	*snap_name;	/* mapped snapshot name, or RBD_SNAP_HEAD_NAME */
	u64	snap_id;	/* mapped snapshot id; CEPH_NOSNAP for the head */
	u64	size;		/* size of the mapped image/snapshot, in bytes */
	u64	features;	/* feature bits of what is mapped */
	bool	snap_exists;	/* true iff a snapshot (not the head) is mapped */
	bool	read_only;	/* is the mapped device read-only? */
};
163
/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	struct rbd_options	rbd_opts;	/* options in effect for this mapping */
	struct rbd_client	*rbd_client;	/* (possibly shared) ceph client */

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue lock */

	struct rbd_image_header	header;		/* in-core image header */
	char			*image_id;	/* image id, if any */
	size_t			image_id_len;
	char			*image_name;	/* user-supplied image name */
	size_t			image_name_len;
	char			*header_name;	/* assumed: header object name — TODO confirm */
	char			*pool_name;	/* pool the image resides in */
	int			pool_id;

	struct ceph_osd_event	*watch_event;	/* registered header watch */
	struct ceph_osd_request	*watch_request;

	/* protects updating the header */
	struct rw_semaphore	header_rwsem;

	struct rbd_mapping	mapping;	/* what is currently mapped */

	struct list_head	node;		/* entry in rbd_dev_list */

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
};
205
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700206static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a822012-01-29 13:57:44 -0600207
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700208static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a822012-01-29 13:57:44 -0600209static DEFINE_SPINLOCK(rbd_dev_list_lock);
210
Alex Elder432b8582012-01-29 13:57:44 -0600211static LIST_HEAD(rbd_client_list); /* clients */
212static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700213
Alex Elder304f6802012-08-31 17:29:52 -0500214static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
215static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
216
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800217static void rbd_dev_release(struct device *dev);
Alex Elder14e70852012-07-19 09:09:27 -0500218static void __rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800219
Alex Elderf0f8cef2012-01-29 13:57:44 -0600220static ssize_t rbd_add(struct bus_type *bus, const char *buf,
221 size_t count);
222static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
223 size_t count);
224
/* Bus attributes: writing to /sys/bus/rbd/{add,remove} maps/unmaps devices */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/* Release callback: rbd_root_dev is static, so there is nothing to free */
static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};
244
#ifdef RBD_DEBUG
/*
 * rbd_assert(expr) - BUG() with a diagnostic if @expr is false.
 * Only active when RBD_DEBUG is defined; compiles to nothing otherwise.
 * Note: expands to a bare if-statement, so use it only where a
 * statement is expected.
 */
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800257
/* Take a reference on the rbd device's embedded sysfs device */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}
262
/* Drop a reference taken by rbd_get_dev() */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700267
Alex Elder1fe5e992012-07-25 09:32:41 -0500268static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700269
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700270static int rbd_open(struct block_device *bdev, fmode_t mode)
271{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600272 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700273
Alex Elderf84344f2012-08-31 17:29:51 -0500274 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700275 return -EROFS;
276
Alex Elder340c7a22012-08-10 13:12:07 -0700277 rbd_get_dev(rbd_dev);
Alex Elderf84344f2012-08-31 17:29:51 -0500278 set_device_ro(bdev, rbd_dev->mapping.read_only);
Alex Elder340c7a22012-08-10 13:12:07 -0700279
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700280 return 0;
281}
282
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800283static int rbd_release(struct gendisk *disk, fmode_t mode)
284{
285 struct rbd_device *rbd_dev = disk->private_data;
286
287 rbd_put_dev(rbd_dev);
288
289 return 0;
290}
291
/* Block device operations for /dev/rbd<N> */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
297
298/*
299 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500300 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700301 */
Alex Elderf8c38922012-08-10 13:12:07 -0700302static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700303{
304 struct rbd_client *rbdc;
305 int ret = -ENOMEM;
306
307 dout("rbd_client_create\n");
308 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
309 if (!rbdc)
310 goto out_opt;
311
312 kref_init(&rbdc->kref);
313 INIT_LIST_HEAD(&rbdc->node);
314
Alex Elderbc534d82012-01-29 13:57:44 -0600315 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
316
Alex Elder43ae4702012-07-03 16:01:18 -0500317 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700318 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600319 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500320 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700321
322 ret = ceph_open_session(rbdc->client);
323 if (ret < 0)
324 goto out_err;
325
Alex Elder432b8582012-01-29 13:57:44 -0600326 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700327 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600328 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700329
Alex Elderbc534d82012-01-29 13:57:44 -0600330 mutex_unlock(&ctl_mutex);
331
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700332 dout("rbd_client_create created %p\n", rbdc);
333 return rbdc;
334
335out_err:
336 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600337out_mutex:
338 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700339 kfree(rbdc);
340out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500341 if (ceph_opts)
342 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400343 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700344}
345
346/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700347 * Find a ceph client with specific addr and configuration. If
348 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700349 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700350static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700351{
352 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700353 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700354
Alex Elder43ae4702012-07-03 16:01:18 -0500355 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700356 return NULL;
357
Alex Elder1f7ba332012-08-10 13:12:07 -0700358 spin_lock(&rbd_client_list_lock);
359 list_for_each_entry(client_node, &rbd_client_list, node) {
360 if (!ceph_compare_options(ceph_opts, client_node->client)) {
361 kref_get(&client_node->kref);
362 found = true;
363 break;
364 }
365 }
366 spin_unlock(&rbd_client_list_lock);
367
368 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700369}
370
371/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700372 * mount options
373 */
/*
 * Token values for rbd option parsing.  The Opt_last_* markers
 * delimit the int/string/Boolean argument ranges that
 * parse_rbd_opts_token() dispatches on.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};
384
Alex Elder43ae4702012-07-03 16:01:18 -0500385static match_table_t rbd_opts_tokens = {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700386 /* int args above */
387 /* string args above */
Alex Elderf84344f2012-08-31 17:29:51 -0500388 {Opt_read_only, "mapping.read_only"},
Alex Eldercc0538b2012-08-10 13:12:07 -0700389 {Opt_read_only, "ro"}, /* Alternate spelling */
390 {Opt_read_write, "read_write"},
391 {Opt_read_write, "rw"}, /* Alternate spelling */
392 /* Boolean args above */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700393 {-1, NULL}
394};
395
/*
 * Parse one rbd-specific option; passed as the callback to
 * ceph_parse_options().  @private is the struct rbd_options being
 * filled in.  Returns 0 on success, -EINVAL for an unrecognized
 * token, or the match_int() error for a malformed integer argument.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* The Opt_last_* markers delimit int/string/Boolean token ranges */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* Every token in rbd_opts_tokens must be handled above */
		rbd_assert(false);
		break;
	}
	return 0;
}
436
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  On success rbd_dev->rbd_client is set and 0
 * is returned; on failure a negative errno.
 */
static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
				size_t mon_addr_len, char *options)
{
	struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
	struct ceph_options *ceph_opts;
	struct rbd_client *rbdc;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	/* rbd-specific options are handled by the parse_rbd_opts_token callback */
	ceph_opts = ceph_parse_options(options, mon_addr,
					mon_addr + mon_addr_len,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(ceph_opts))
		return PTR_ERR(ceph_opts);

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		/* using an existing client; ceph_opts is no longer needed */
		ceph_destroy_options(ceph_opts);
	} else {
		/* rbd_client_create() takes ownership of ceph_opts */
		rbdc = rbd_client_create(ceph_opts);
		if (IS_ERR(rbdc))
			return PTR_ERR(rbdc);
	}
	rbd_dev->rbd_client = rbdc;

	return 0;
}
469
/*
 * Destroy ceph client (kref release callback).
 *
 * Takes rbd_client_list_lock itself to unlink the client, so the
 * caller must NOT already hold that lock.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
487
/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_device *rbd_dev)
{
	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
	rbd_dev->rbd_client = NULL;	/* guard against use after the put */
}
497
/*
 * Destroy requests collection (kref release callback for
 * struct rbd_req_coll).
 */
static void rbd_coll_release(struct kref *kref)
{
	struct rbd_req_coll *coll =
		container_of(kref, struct rbd_req_coll, kref);

	dout("rbd_coll_release %p\n", coll);
	kfree(coll);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700509
/*
 * Sanity-check an on-disk image header before translating it.
 * Returns false if the magic text is wrong or the snapshot metadata
 * would overflow a size_t when the in-core header is built.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
538
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700539/*
540 * Create a new header structure, translate header format from the on-disk
541 * header.
542 */
543static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500544 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700545{
Alex Elderccece232012-07-10 20:30:10 -0500546 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500547 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500548 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500549 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700550
Alex Elder6a523252012-07-19 17:12:59 -0500551 memset(header, 0, sizeof (*header));
552
Alex Elder103a1502012-08-02 11:29:45 -0500553 snap_count = le32_to_cpu(ondisk->snap_count);
554
Alex Elder58c17b02012-08-23 23:22:06 -0500555 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
556 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500557 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700558 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500559 memcpy(header->object_prefix, ondisk->object_prefix, len);
560 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600561
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700562 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500563 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
564
Alex Elder621901d2012-08-23 23:22:06 -0500565 /* Save a copy of the snapshot names */
566
Alex Elderf785cc12012-08-23 23:22:06 -0500567 if (snap_names_len > (u64) SIZE_MAX)
568 return -EIO;
569 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700570 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500571 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500572 /*
573 * Note that rbd_dev_v1_header_read() guarantees
574 * the ondisk buffer we're working with has
575 * snap_names_len bytes beyond the end of the
576 * snapshot id array, this memcpy() is safe.
577 */
578 memcpy(header->snap_names, &ondisk->snaps[snap_count],
579 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500580
Alex Elder621901d2012-08-23 23:22:06 -0500581 /* Record each snapshot's size */
582
Alex Elderd2bb24e2012-07-26 23:37:14 -0500583 size = snap_count * sizeof (*header->snap_sizes);
584 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700585 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500586 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500587 for (i = 0; i < snap_count; i++)
588 header->snap_sizes[i] =
589 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700590 } else {
Alex Elderccece232012-07-10 20:30:10 -0500591 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700592 header->snap_names = NULL;
593 header->snap_sizes = NULL;
594 }
Alex Elder849b4262012-07-09 21:04:24 -0500595
Alex Elder34b13182012-07-13 20:35:12 -0500596 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700597 header->obj_order = ondisk->options.order;
598 header->crypt_type = ondisk->options.crypt_type;
599 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500600
Alex Elder621901d2012-08-23 23:22:06 -0500601 /* Allocate and fill in the snapshot context */
602
Alex Elderf84344f2012-08-31 17:29:51 -0500603 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500604 size = sizeof (struct ceph_snap_context);
605 size += snap_count * sizeof (header->snapc->snaps[0]);
606 header->snapc = kzalloc(size, GFP_KERNEL);
607 if (!header->snapc)
608 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700609
610 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500611 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700612 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500613 for (i = 0; i < snap_count; i++)
614 header->snapc->snaps[i] =
615 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700616
617 return 0;
618
Alex Elder6a523252012-07-19 17:12:59 -0500619out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500620 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500621 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700622 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500623 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500624 kfree(header->object_prefix);
625 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500626
Alex Elder00f1f362012-02-07 12:03:36 -0600627 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700628}
629
Alex Elder8836b992012-08-30 14:42:15 -0500630static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700631{
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700632
Alex Eldere86924a2012-07-10 20:30:11 -0500633 struct rbd_snap *snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600634
Alex Eldere86924a2012-07-10 20:30:11 -0500635 list_for_each_entry(snap, &rbd_dev->snaps, node) {
636 if (!strcmp(snap_name, snap->name)) {
637 rbd_dev->mapping.snap_id = snap->id;
638 rbd_dev->mapping.size = snap->size;
Alex Elder34b13182012-07-13 20:35:12 -0500639 rbd_dev->mapping.features = snap->features;
Alex Elder00f1f362012-02-07 12:03:36 -0600640
Alex Eldere86924a2012-07-10 20:30:11 -0500641 return 0;
Alex Elder00f1f362012-02-07 12:03:36 -0600642 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700643 }
Alex Eldere86924a2012-07-10 20:30:11 -0500644
Alex Elder00f1f362012-02-07 12:03:36 -0600645 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700646}
647
Alex Elder5ed16172012-08-29 17:11:07 -0500648static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700649{
Alex Elder78dc4472012-07-19 08:49:18 -0500650 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700651
Alex Elder4e1105a2012-08-31 17:29:52 -0500652 if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
Josh Durgincc9d7342011-11-21 18:19:13 -0800653 sizeof (RBD_SNAP_HEAD_NAME))) {
Alex Elderf84344f2012-08-31 17:29:51 -0500654 rbd_dev->mapping.snap_id = CEPH_NOSNAP;
Alex Elder99c1f082012-08-30 14:42:15 -0500655 rbd_dev->mapping.size = rbd_dev->header.image_size;
Alex Elder34b13182012-07-13 20:35:12 -0500656 rbd_dev->mapping.features = rbd_dev->header.features;
Alex Elderf84344f2012-08-31 17:29:51 -0500657 rbd_dev->mapping.snap_exists = false;
658 rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only;
Alex Eldere86924a2012-07-10 20:30:11 -0500659 ret = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700660 } else {
Alex Elder8836b992012-08-30 14:42:15 -0500661 ret = snap_by_name(rbd_dev, snap_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700662 if (ret < 0)
663 goto done;
Alex Elderf84344f2012-08-31 17:29:51 -0500664 rbd_dev->mapping.snap_exists = true;
665 rbd_dev->mapping.read_only = true;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700666 }
Alex Elder4e1105a2012-08-31 17:29:52 -0500667 rbd_dev->mapping.snap_name = snap_name;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700668done:
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700669 return ret;
670}
671
/*
 * Release everything rbd_header_from_disk() allocated, NULLing each
 * pointer afterwards.
 */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	ceph_put_snap_context(header->snapc);
	header->snapc = NULL;
}
683
Alex Elder65ccfe22012-08-09 10:33:26 -0700684static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700685{
Alex Elder65ccfe22012-08-09 10:33:26 -0700686 char *name;
687 u64 segment;
688 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700689
Alex Elder65ccfe22012-08-09 10:33:26 -0700690 name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
691 if (!name)
692 return NULL;
693 segment = offset >> rbd_dev->header.obj_order;
694 ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
695 rbd_dev->header.object_prefix, segment);
696 if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
697 pr_err("error formatting segment name for #%llu (%d)\n",
698 segment, ret);
699 kfree(name);
700 name = NULL;
701 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700702
Alex Elder65ccfe22012-08-09 10:33:26 -0700703 return name;
704}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700705
Alex Elder65ccfe22012-08-09 10:33:26 -0700706static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
707{
708 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700709
Alex Elder65ccfe22012-08-09 10:33:26 -0700710 return offset & (segment_size - 1);
711}
712
713static u64 rbd_segment_length(struct rbd_device *rbd_dev,
714 u64 offset, u64 length)
715{
716 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
717
718 offset &= segment_size - 1;
719
Alex Elderaafb230e2012-09-06 16:00:54 -0500720 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700721 if (offset + length > segment_size)
722 length = segment_size - offset;
723
724 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700725}
726
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700727static int rbd_get_num_segments(struct rbd_image_header *header,
728 u64 ofs, u64 len)
729{
Alex Elderdf111be2012-08-09 10:33:26 -0700730 u64 start_seg;
731 u64 end_seg;
732
733 if (!len)
734 return 0;
735 if (len - 1 > U64_MAX - ofs)
736 return -ERANGE;
737
738 start_seg = ofs >> header->obj_order;
739 end_seg = (ofs + len - 1) >> header->obj_order;
740
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700741 return end_seg - start_seg + 1;
742}
743
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700744/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700745 * returns the size of an object in the image
746 */
747static u64 rbd_obj_bytes(struct rbd_image_header *header)
748{
749 return 1 << header->obj_order;
750}
751
752/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700753 * bio helpers
754 */
755
756static void bio_chain_put(struct bio *chain)
757{
758 struct bio *tmp;
759
760 while (chain) {
761 tmp = chain;
762 chain = chain->bi_next;
763 bio_put(tmp);
764 }
765}
766
/*
 * Zero the data in a bio chain from byte offset start_ofs (relative
 * to the start of the chain) through the end of the chain.  Used to
 * clear the unread tail of a short or failed read.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* running byte offset within the whole chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* zero from start_ofs if it falls inside
				   this vector, else from its beginning */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
793
794/*
795 * bio_chain_clone - clone a chain of bios up to a certain length.
796 * might return a bio_pair that will need to be released.
797 */
798static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
799 struct bio_pair **bp,
800 int len, gfp_t gfpmask)
801{
Alex Elder542582f2012-08-09 10:33:25 -0700802 struct bio *old_chain = *old;
803 struct bio *new_chain = NULL;
804 struct bio *tail;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700805 int total = 0;
806
807 if (*bp) {
808 bio_pair_release(*bp);
809 *bp = NULL;
810 }
811
812 while (old_chain && (total < len)) {
Alex Elder542582f2012-08-09 10:33:25 -0700813 struct bio *tmp;
814
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700815 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
816 if (!tmp)
817 goto err_out;
Alex Elder542582f2012-08-09 10:33:25 -0700818 gfpmask &= ~__GFP_WAIT; /* can't wait after the first */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700819
820 if (total + old_chain->bi_size > len) {
821 struct bio_pair *bp;
822
823 /*
824 * this split can only happen with a single paged bio,
825 * split_bio will BUG_ON if this is not the case
826 */
827 dout("bio_chain_clone split! total=%d remaining=%d"
Alex Elderbd919d42012-07-13 20:35:11 -0500828 "bi_size=%u\n",
829 total, len - total, old_chain->bi_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700830
831 /* split the bio. We'll release it either in the next
832 call, or it will have to be released outside */
Alex Elder593a9e72012-02-07 12:03:37 -0600833 bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700834 if (!bp)
835 goto err_out;
836
837 __bio_clone(tmp, &bp->bio1);
838
839 *next = &bp->bio2;
840 } else {
841 __bio_clone(tmp, old_chain);
842 *next = old_chain->bi_next;
843 }
844
845 tmp->bi_bdev = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700846 tmp->bi_next = NULL;
Alex Elder542582f2012-08-09 10:33:25 -0700847 if (new_chain)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700848 tail->bi_next = tmp;
Alex Elder542582f2012-08-09 10:33:25 -0700849 else
850 new_chain = tmp;
851 tail = tmp;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700852 old_chain = old_chain->bi_next;
853
854 total += tmp->bi_size;
855 }
856
Alex Elderaafb230e2012-09-06 16:00:54 -0500857 rbd_assert(total == len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700858
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700859 *old = old_chain;
860
861 return new_chain;
862
863err_out:
864 dout("bio_chain_clone with err\n");
865 bio_chain_put(new_chain);
866 return NULL;
867}
868
869/*
870 * helpers for osd request op vectors.
871 */
Alex Elder57cfc102012-06-26 12:57:03 -0700872static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
873 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700874{
Alex Elder57cfc102012-06-26 12:57:03 -0700875 struct ceph_osd_req_op *ops;
876
877 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
878 if (!ops)
879 return NULL;
880
881 ops[0].op = opcode;
882
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700883 /*
884 * op extent offset and length will be set later on
885 * in calc_raw_layout()
886 */
Alex Elder57cfc102012-06-26 12:57:03 -0700887 ops[0].payload_len = payload_len;
888
889 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700890}
891
/* Free an op vector allocated by rbd_create_rw_ops(). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
896
/*
 * Record completion of the sub-request at @index in @coll and finish
 * as much of the original block request @rq as is now contiguously
 * done.  With no collection, the whole request is completed at once.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	if (!coll) {
		/* no collection: single-segment request, finish it all */
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	/* coll state and blk completion are serialized by the queue lock */
	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	max = min = coll->num_done;
	while (max < coll->total && coll->status[max].done)
		max++;

	/* complete the run of finished segments in submission order */
	for (i = min; i<max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
934
/* Complete the collection slot associated with rbd request @req. */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
940
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700941/*
942 * Send ceph osd request
943 */
944static int rbd_do_request(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -0500945 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700946 struct ceph_snap_context *snapc,
947 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -0500948 const char *object_name, u64 ofs, u64 len,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700949 struct bio *bio,
950 struct page **pages,
951 int num_pages,
952 int flags,
953 struct ceph_osd_req_op *ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700954 struct rbd_req_coll *coll,
955 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700956 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700957 struct ceph_msg *msg),
958 struct ceph_osd_request **linger_req,
959 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700960{
961 struct ceph_osd_request *req;
962 struct ceph_file_layout *layout;
963 int ret;
964 u64 bno;
965 struct timespec mtime = CURRENT_TIME;
966 struct rbd_request *req_data;
967 struct ceph_osd_request_head *reqhead;
Alex Elder1dbb4392012-01-24 10:08:37 -0600968 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700969
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700970 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700971 if (!req_data) {
972 if (coll)
973 rbd_coll_end_req_index(rq, coll, coll_index,
974 -ENOMEM, len);
975 return -ENOMEM;
976 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700977
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700978 if (coll) {
979 req_data->coll = coll;
980 req_data->coll_index = coll_index;
981 }
982
Alex Elderbd919d42012-07-13 20:35:11 -0500983 dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
984 (unsigned long long) ofs, (unsigned long long) len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700985
Alex Elder0ce1a792012-07-03 16:01:18 -0500986 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder1dbb4392012-01-24 10:08:37 -0600987 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
988 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -0700989 if (!req) {
Sage Weil4ad12622011-05-03 09:23:36 -0700990 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700991 goto done_pages;
992 }
993
994 req->r_callback = rbd_cb;
995
996 req_data->rq = rq;
997 req_data->bio = bio;
998 req_data->pages = pages;
999 req_data->len = len;
1000
1001 req->r_priv = req_data;
1002
1003 reqhead = req->r_request->front.iov_base;
1004 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
1005
Alex Elderaded07e2012-07-03 16:01:18 -05001006 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001007 req->r_oid_len = strlen(req->r_oid);
1008
1009 layout = &req->r_file_layout;
1010 memset(layout, 0, sizeof(*layout));
1011 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1012 layout->fl_stripe_count = cpu_to_le32(1);
1013 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
Alex Elder0ce1a792012-07-03 16:01:18 -05001014 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
Alex Elder1dbb4392012-01-24 10:08:37 -06001015 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
1016 req, ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001017
1018 ceph_osdc_build_request(req, ofs, &len,
1019 ops,
1020 snapc,
1021 &mtime,
1022 req->r_oid, req->r_oid_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001023
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001024 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -06001025 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001026 *linger_req = req;
1027 }
1028
Alex Elder1dbb4392012-01-24 10:08:37 -06001029 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001030 if (ret < 0)
1031 goto done_err;
1032
1033 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -06001034 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001035 if (ver)
1036 *ver = le64_to_cpu(req->r_reassert_version.version);
Alex Elderbd919d42012-07-13 20:35:11 -05001037 dout("reassert_ver=%llu\n",
1038 (unsigned long long)
1039 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001040 ceph_osdc_put_request(req);
1041 }
1042 return ret;
1043
1044done_err:
1045 bio_chain_put(req_data->bio);
1046 ceph_osdc_put_request(req);
1047done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001048 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001049 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001050 return ret;
1051}
1052
/*
 * Ceph osd op callback
 *
 * Completion handler for asynchronous requests submitted by
 * rbd_do_op().  Decodes the reply, zero-fills short or missing
 * reads, completes the collection slot and releases the request.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
		(unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* object doesn't exist: read as all zeroes */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero the remainder */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1092
/* Minimal completion callback: just drop the request reference. */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1097
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a temporary page vector for the data, submits via
 * rbd_do_request() in synchronous mode, and for reads copies up to
 * @inbound_size result bytes into @inbound.  Returns the byte count
 * from the osd (or copy result) on success, negative errno on error.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int flags,
			   struct ceph_osd_req_op *ops,
			   const char *object_name,
			   u64 ofs, u64 inbound_size,
			   char *inbound,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(ops != NULL);

	num_pages = calc_pages_for(ofs, inbound_size);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* rbd_cb == NULL makes rbd_do_request() wait for completion */
	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			     object_name, ofs, inbound_size, NULL,
			     pages, num_pages,
			     flags,
			     ops,
			     NULL, 0,
			     NULL,
			     linger_req, ver);
	if (ret < 0)
		goto done;

	/* ret is the number of bytes the osd returned */
	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1141
/*
 * Do an asynchronous ceph osd operation
 *
 * Maps the image byte range [ofs, ofs + len) onto its backing
 * object (the caller guarantees the range lies within a single
 * segment) and submits the op with rbd_req_cb as completion.
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 snapid,
		     int opcode, int flags,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;

	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	/* only writes carry a data payload in the op */
	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1194
1195/*
1196 * Request async osd write
1197 */
1198static int rbd_req_write(struct request *rq,
1199 struct rbd_device *rbd_dev,
1200 struct ceph_snap_context *snapc,
1201 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001202 struct bio *bio,
1203 struct rbd_req_coll *coll,
1204 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001205{
1206 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1207 CEPH_OSD_OP_WRITE,
1208 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001209 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001210}
1211
1212/*
1213 * Request async osd read
1214 */
1215static int rbd_req_read(struct request *rq,
1216 struct rbd_device *rbd_dev,
1217 u64 snapid,
1218 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001219 struct bio *bio,
1220 struct rbd_req_coll *coll,
1221 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001222{
1223 return rbd_do_op(rq, rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001224 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001225 CEPH_OSD_OP_READ,
1226 CEPH_OSD_FLAG_READ,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001227 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001228}
1229
/*
 * Request sync osd read
 *
 * Synchronously read [ofs, ofs + len) of @object_name at @snapid
 * into @buf.  Returns bytes read or negative errno.
 */
static int rbd_req_sync_read(struct rbd_device *rbd_dev,
			  u64 snapid,
			  const char *object_name,
			  u64 ofs, u64 len,
			  char *buf,
			  u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
	if (!ops)
		return -ENOMEM;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			       snapid,
			       CEPH_OSD_FLAG_READ,
			       ops, object_name, ofs, len, buf, NULL, ver);
	rbd_destroy_ops(ops);

	return ret;
}
1255
/*
 * Acknowledge a watch notification on the header object.
 *
 * Despite the _sync_ name this is submitted asynchronously
 * (rbd_simple_req_cb just drops the request on completion).
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = cpu_to_le64(ver);
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  rbd_dev->header_name, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  ops,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1285
1286static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1287{
Alex Elder0ce1a792012-07-03 16:01:18 -05001288 struct rbd_device *rbd_dev = (struct rbd_device *)data;
Josh Durgina71b8912011-12-05 18:10:44 -08001289 u64 hver;
Sage Weil13143d22011-05-12 16:08:30 -07001290 int rc;
1291
Alex Elder0ce1a792012-07-03 16:01:18 -05001292 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001293 return;
1294
Alex Elderbd919d42012-07-13 20:35:11 -05001295 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1296 rbd_dev->header_name, (unsigned long long) notify_id,
1297 (unsigned int) opcode);
Alex Elder1fe5e992012-07-25 09:32:41 -05001298 rc = rbd_refresh_header(rbd_dev, &hver);
Sage Weil13143d22011-05-12 16:08:30 -07001299 if (rc)
Alex Elderf0f8cef2012-01-29 13:57:44 -06001300 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
Alex Elder0ce1a792012-07-03 16:01:18 -05001301 " update snaps: %d\n", rbd_dev->major, rc);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001302
Alex Elder7f0a24d2012-07-25 09:32:40 -05001303 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001304}
1305
/*
 * Request sync osd watch
 *
 * Register a watch on the header object so rbd_watch_cb() is called
 * on changes.  Creates the osd event, then issues a lingering WATCH
 * op whose request is stored in rbd_dev->watch_request.
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 == register the watch */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1349
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001350/*
1351 * Request sync osd unwatch
1352 */
Alex Elder070c6332012-07-25 09:32:41 -05001353static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001354{
1355 struct ceph_osd_req_op *ops;
Alex Elder57cfc102012-06-26 12:57:03 -07001356 int ret;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001357
Alex Elder57cfc102012-06-26 12:57:03 -07001358 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1359 if (!ops)
1360 return -ENOMEM;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001361
1362 ops[0].watch.ver = 0;
Alex Elder0ce1a792012-07-03 16:01:18 -05001363 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001364 ops[0].watch.flag = 0;
1365
Alex Elder0ce1a792012-07-03 16:01:18 -05001366 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001367 CEPH_NOSNAP,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001368 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1369 ops,
Alex Elder070c6332012-07-25 09:32:41 -05001370 rbd_dev->header_name,
1371 0, 0, NULL, NULL, NULL);
1372
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001373
1374 rbd_destroy_ops(ops);
Alex Elder0ce1a792012-07-03 16:01:18 -05001375 ceph_osdc_cancel_event(rbd_dev->watch_event);
1376 rbd_dev->watch_event = NULL;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001377 return ret;
1378}
1379
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001380/*
Alex Elder3cb4a682012-06-26 12:57:03 -07001381 * Synchronous osd object method call
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001382 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001383static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001384 const char *object_name,
1385 const char *class_name,
1386 const char *method_name,
Alex Elder3cb4a682012-06-26 12:57:03 -07001387 const char *outbound,
1388 size_t outbound_size,
Alex Elderf8d4de62012-07-03 16:01:19 -05001389 char *inbound,
1390 size_t inbound_size,
Alex Elder3cb4a682012-06-26 12:57:03 -07001391 int flags,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001392 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001393{
1394 struct ceph_osd_req_op *ops;
Alex Elderaded07e2012-07-03 16:01:18 -05001395 int class_name_len = strlen(class_name);
1396 int method_name_len = strlen(method_name);
Alex Elder3cb4a682012-06-26 12:57:03 -07001397 int payload_size;
Alex Elder57cfc102012-06-26 12:57:03 -07001398 int ret;
1399
Alex Elder3cb4a682012-06-26 12:57:03 -07001400 /*
1401 * Any input parameters required by the method we're calling
1402 * will be sent along with the class and method names as
1403 * part of the message payload. That data and its size are
1404 * supplied via the indata and indata_len fields (named from
1405 * the perspective of the server side) in the OSD request
1406 * operation.
1407 */
1408 payload_size = class_name_len + method_name_len + outbound_size;
1409 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
Alex Elder57cfc102012-06-26 12:57:03 -07001410 if (!ops)
1411 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001412
Alex Elderaded07e2012-07-03 16:01:18 -05001413 ops[0].cls.class_name = class_name;
1414 ops[0].cls.class_len = (__u8) class_name_len;
1415 ops[0].cls.method_name = method_name;
1416 ops[0].cls.method_len = (__u8) method_name_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001417 ops[0].cls.argc = 0;
Alex Elder3cb4a682012-06-26 12:57:03 -07001418 ops[0].cls.indata = outbound;
1419 ops[0].cls.indata_len = outbound_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001420
Alex Elder0ce1a792012-07-03 16:01:18 -05001421 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001422 CEPH_NOSNAP,
Alex Elder3cb4a682012-06-26 12:57:03 -07001423 flags, ops,
Alex Elderf8d4de62012-07-03 16:01:19 -05001424 object_name, 0, inbound_size, inbound,
1425 NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001426
1427 rbd_destroy_ops(ops);
1428
1429 dout("cls_exec returned %d\n", ret);
1430 return ret;
1431}
1432
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001433static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1434{
1435 struct rbd_req_coll *coll =
1436 kzalloc(sizeof(struct rbd_req_coll) +
1437 sizeof(struct rbd_req_status) * num_reqs,
1438 GFP_ATOMIC);
1439
1440 if (!coll)
1441 return NULL;
1442 coll->total = num_reqs;
1443 kref_init(&coll->kref);
1444 return coll;
1445}
1446
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001447/*
1448 * block device queue callback
1449 */
1450static void rbd_rq_fn(struct request_queue *q)
1451{
1452 struct rbd_device *rbd_dev = q->queuedata;
1453 struct request *rq;
1454 struct bio_pair *bp = NULL;
1455
Alex Elder00f1f362012-02-07 12:03:36 -06001456 while ((rq = blk_fetch_request(q))) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001457 struct bio *bio;
1458 struct bio *rq_bio, *next_bio = NULL;
1459 bool do_write;
Alex Elderbd919d42012-07-13 20:35:11 -05001460 unsigned int size;
1461 u64 op_size = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001462 u64 ofs;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001463 int num_segs, cur_seg = 0;
1464 struct rbd_req_coll *coll;
Josh Durgind1d25642011-12-05 14:03:05 -08001465 struct ceph_snap_context *snapc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001466
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001467 dout("fetched request\n");
1468
1469 /* filter out block requests we don't understand */
1470 if ((rq->cmd_type != REQ_TYPE_FS)) {
1471 __blk_end_request_all(rq, 0);
Alex Elder00f1f362012-02-07 12:03:36 -06001472 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001473 }
1474
1475 /* deduce our operation (read, write) */
1476 do_write = (rq_data_dir(rq) == WRITE);
1477
1478 size = blk_rq_bytes(rq);
Alex Elder593a9e72012-02-07 12:03:37 -06001479 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001480 rq_bio = rq->bio;
Alex Elderf84344f2012-08-31 17:29:51 -05001481 if (do_write && rbd_dev->mapping.read_only) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001482 __blk_end_request_all(rq, -EROFS);
Alex Elder00f1f362012-02-07 12:03:36 -06001483 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001484 }
1485
1486 spin_unlock_irq(q->queue_lock);
1487
Josh Durgind1d25642011-12-05 14:03:05 -08001488 down_read(&rbd_dev->header_rwsem);
Josh Durgine88a36e2011-11-21 18:14:25 -08001489
Alex Elderf84344f2012-08-31 17:29:51 -05001490 if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
1491 !rbd_dev->mapping.snap_exists) {
Josh Durgine88a36e2011-11-21 18:14:25 -08001492 up_read(&rbd_dev->header_rwsem);
Josh Durgind1d25642011-12-05 14:03:05 -08001493 dout("request for non-existent snapshot");
1494 spin_lock_irq(q->queue_lock);
1495 __blk_end_request_all(rq, -ENXIO);
1496 continue;
Josh Durgine88a36e2011-11-21 18:14:25 -08001497 }
1498
Josh Durgind1d25642011-12-05 14:03:05 -08001499 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1500
1501 up_read(&rbd_dev->header_rwsem);
1502
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001503 dout("%s 0x%x bytes at 0x%llx\n",
1504 do_write ? "write" : "read",
Alex Elderbd919d42012-07-13 20:35:11 -05001505 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001506
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001507 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
Alex Elderdf111be2012-08-09 10:33:26 -07001508 if (num_segs <= 0) {
1509 spin_lock_irq(q->queue_lock);
1510 __blk_end_request_all(rq, num_segs);
1511 ceph_put_snap_context(snapc);
1512 continue;
1513 }
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001514 coll = rbd_alloc_coll(num_segs);
1515 if (!coll) {
1516 spin_lock_irq(q->queue_lock);
1517 __blk_end_request_all(rq, -ENOMEM);
Josh Durgind1d25642011-12-05 14:03:05 -08001518 ceph_put_snap_context(snapc);
Alex Elder00f1f362012-02-07 12:03:36 -06001519 continue;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001520 }
1521
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001522 do {
1523 /* a bio clone to be passed down to OSD req */
Alex Elderbd919d42012-07-13 20:35:11 -05001524 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
Alex Elder65ccfe22012-08-09 10:33:26 -07001525 op_size = rbd_segment_length(rbd_dev, ofs, size);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001526 kref_get(&coll->kref);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001527 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1528 op_size, GFP_ATOMIC);
1529 if (!bio) {
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001530 rbd_coll_end_req_index(rq, coll, cur_seg,
1531 -ENOMEM, op_size);
1532 goto next_seg;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001533 }
1534
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001535
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001536 /* init OSD command: write or read */
1537 if (do_write)
1538 rbd_req_write(rq, rbd_dev,
Josh Durgind1d25642011-12-05 14:03:05 -08001539 snapc,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001540 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001541 op_size, bio,
1542 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001543 else
1544 rbd_req_read(rq, rbd_dev,
Alex Elderf84344f2012-08-31 17:29:51 -05001545 rbd_dev->mapping.snap_id,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001546 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001547 op_size, bio,
1548 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001549
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001550next_seg:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001551 size -= op_size;
1552 ofs += op_size;
1553
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001554 cur_seg++;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001555 rq_bio = next_bio;
1556 } while (size > 0);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001557 kref_put(&coll->kref, rbd_coll_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001558
1559 if (bp)
1560 bio_pair_release(bp);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001561 spin_lock_irq(q->queue_lock);
Josh Durgind1d25642011-12-05 14:03:05 -08001562
1563 ceph_put_snap_context(snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001564 }
1565}
1566
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	unsigned int chunk_sectors;	/* rados object size, in sectors */
	sector_t sector;		/* absolute start sector of the bio */
	unsigned int bio_sectors;	/* sectors already in the bio */
	int max;

	/* obj_order is log2 of the object size, so this is a power of two */
	chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
	bio_sectors = bmd->bi_size >> SECTOR_SHIFT;

	/* Bytes left in the object past the current end of the bio */
	max = (chunk_sectors - ((sector & (chunk_sectors - 1))
				 + bio_sectors)) << SECTOR_SHIFT;
	if (max < 0)
		max = 0; /* bio_add cannot handle a negative return */
	/* An empty bio may always take one bvec (single-page case) */
	if (max <= bvec->bv_len && bio_sectors == 0)
		return bvec->bv_len;
	return max;
}
1593
1594static void rbd_free_disk(struct rbd_device *rbd_dev)
1595{
1596 struct gendisk *disk = rbd_dev->disk;
1597
1598 if (!disk)
1599 return;
1600
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001601 if (disk->flags & GENHD_FL_UP)
1602 del_gendisk(disk);
1603 if (disk->queue)
1604 blk_cleanup_queue(disk->queue);
1605 put_disk(disk);
1606}
1607
/*
 * Read the complete header for the given rbd device.
 *
 * Returns a pointer to a dynamically-allocated buffer containing
 * the complete and validated header.  Caller can pass the address
 * of a variable that will be filled in with the version of the
 * header object at the time it was read.
 *
 * Returns a pointer-coded errno if a failure occurs.
 */
static struct rbd_image_header_ondisk *
rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;	/* snapshot count assumed when sizing */
	u64 names_size = 0;	/* bytes of snapshot-name data assumed */
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		kfree(ondisk);	/* no-op on the first pass (still NULL) */

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return ERR_PTR(-ENOMEM);

		ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
				       rbd_dev->header_name,
				       0, size,
				       (char *) ondisk, version);

		if (ret < 0)
			goto out_err;
		/* Short read: header shrank under us; treat as an error */
		if (WARN_ON((size_t) ret < size)) {
			ret = -ENXIO;
			pr_warning("short header read for image %s"
					" (want %zd got %d)\n",
				rbd_dev->image_name, size, ret);
			goto out_err;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			pr_warning("invalid header for image %s\n",
				rbd_dev->image_name);
			goto out_err;
		}

		/* Re-size and retry if the snapshot count changed */
		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	return ondisk;

out_err:
	kfree(ondisk);

	return ERR_PTR(ret);
}
1679
1680/*
1681 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001682 */
1683static int rbd_read_header(struct rbd_device *rbd_dev,
1684 struct rbd_image_header *header)
1685{
Alex Elder4156d992012-08-02 11:29:46 -05001686 struct rbd_image_header_ondisk *ondisk;
1687 u64 ver = 0;
1688 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001689
Alex Elder4156d992012-08-02 11:29:46 -05001690 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1691 if (IS_ERR(ondisk))
1692 return PTR_ERR(ondisk);
1693 ret = rbd_header_from_disk(header, ondisk);
1694 if (ret >= 0)
1695 header->obj_version = ver;
1696 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001697
Alex Elder4156d992012-08-02 11:29:46 -05001698 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001699}
1700
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001701static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1702{
1703 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001704 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001705
Alex Eldera0593292012-07-19 09:09:27 -05001706 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder14e70852012-07-19 09:09:27 -05001707 __rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001708}
1709
/*
 * Re-read the image header and swap the refreshed contents into the
 * in-core header under the header rwsem.  Optionally reports the new
 * header object version via *hver.  Caller must hold ctl_mutex.
 */
static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	struct rbd_image_header h;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	down_write(&rbd_dev->header_rwsem);

	/* resized? (only meaningful when the base image is mapped) */
	if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) {
		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;

		if (size != (sector_t) rbd_dev->mapping.size) {
			dout("setting size to %llu sectors",
				(unsigned long long) size);
			rbd_dev->mapping.size = (u64) size;
			set_capacity(rbd_dev->disk, size);
		}
	}

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	if (hver)
		*hver = h.obj_version;
	rbd_dev->header.obj_version = h.obj_version;
	rbd_dev->header.image_size = h.image_size;
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	/* Reconcile the snapshot list and register any new snap devices */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (!ret)
		ret = rbd_dev_snaps_register(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
1761
/*
 * Public wrapper for __rbd_refresh_header(): takes ctl_mutex around
 * the refresh.  SINGLE_DEPTH_NESTING because the caller may already
 * hold the mutex at the outer level (e.g. during device add).
 */
static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	ret = __rbd_refresh_header(rbd_dev, hver);
	mutex_unlock(&ctl_mutex);

	return ret;
}
1772
/*
 * Set up the gendisk and request queue for an rbd device and record
 * them in rbd_dev->disk.  Capacity is taken from the current mapping.
 * Returns 0 on success, -ENOMEM if either allocation fails.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* keep bios from spanning rados objects; see rbd_merge_bvec */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
1821
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001822/*
1823 sysfs
1824*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001825
/* Map a sysfs struct device back to its containing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1830
/* sysfs: show the mapped size in bytes, read under the header rwsem. */
static ssize_t rbd_size_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	sector_t size;

	/* capacity can change on refresh; take the header lock to read it */
	down_read(&rbd_dev->header_rwsem);
	size = get_capacity(rbd_dev->disk);
	up_read(&rbd_dev->header_rwsem);

	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
}
1843
Alex Elder34b13182012-07-13 20:35:12 -05001844/*
1845 * Note this shows the features for whatever's mapped, which is not
1846 * necessarily the base image.
1847 */
1848static ssize_t rbd_features_show(struct device *dev,
1849 struct device_attribute *attr, char *buf)
1850{
1851 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1852
1853 return sprintf(buf, "0x%016llx\n",
1854 (unsigned long long) rbd_dev->mapping.features);
1855}
1856
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001857static ssize_t rbd_major_show(struct device *dev,
1858 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001859{
Alex Elder593a9e72012-02-07 12:03:37 -06001860 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001861
1862 return sprintf(buf, "%d\n", rbd_dev->major);
1863}
1864
1865static ssize_t rbd_client_id_show(struct device *dev,
1866 struct device_attribute *attr, char *buf)
1867{
Alex Elder593a9e72012-02-07 12:03:37 -06001868 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001869
Alex Elder1dbb4392012-01-24 10:08:37 -06001870 return sprintf(buf, "client%lld\n",
1871 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001872}
1873
1874static ssize_t rbd_pool_show(struct device *dev,
1875 struct device_attribute *attr, char *buf)
1876{
Alex Elder593a9e72012-02-07 12:03:37 -06001877 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001878
1879 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1880}
1881
Alex Elder9bb2f332012-07-12 10:46:35 -05001882static ssize_t rbd_pool_id_show(struct device *dev,
1883 struct device_attribute *attr, char *buf)
1884{
1885 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1886
1887 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1888}
1889
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001890static ssize_t rbd_name_show(struct device *dev,
1891 struct device_attribute *attr, char *buf)
1892{
Alex Elder593a9e72012-02-07 12:03:37 -06001893 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001894
Alex Elder0bed54d2012-07-03 16:01:18 -05001895 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001896}
1897
Alex Elder589d30e2012-07-10 20:30:11 -05001898static ssize_t rbd_image_id_show(struct device *dev,
1899 struct device_attribute *attr, char *buf)
1900{
1901 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1902
1903 return sprintf(buf, "%s\n", rbd_dev->image_id);
1904}
1905
Alex Elder34b13182012-07-13 20:35:12 -05001906/*
1907 * Shows the name of the currently-mapped snapshot (or
1908 * RBD_SNAP_HEAD_NAME for the base image).
1909 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001910static ssize_t rbd_snap_show(struct device *dev,
1911 struct device_attribute *attr,
1912 char *buf)
1913{
Alex Elder593a9e72012-02-07 12:03:37 -06001914 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001915
Alex Elderf84344f2012-08-31 17:29:51 -05001916 return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001917}
1918
1919static ssize_t rbd_image_refresh(struct device *dev,
1920 struct device_attribute *attr,
1921 const char *buf,
1922 size_t size)
1923{
Alex Elder593a9e72012-02-07 12:03:37 -06001924 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05001925 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001926
Alex Elder1fe5e992012-07-25 09:32:41 -05001927 ret = rbd_refresh_header(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05001928
1929 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001930}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001931
/* Per-device sysfs attributes (Documentation/ABI/testing/sysfs-bus-rbd) */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* Empty release: rbd_device lifetime is managed by rbd_dev_release. */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
1975
1976
1977/*
1978 sysfs - snapshots
1979*/
1980
1981static ssize_t rbd_snap_size_show(struct device *dev,
1982 struct device_attribute *attr,
1983 char *buf)
1984{
1985 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1986
Josh Durgin35915382011-12-05 18:25:13 -08001987 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001988}
1989
1990static ssize_t rbd_snap_id_show(struct device *dev,
1991 struct device_attribute *attr,
1992 char *buf)
1993{
1994 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1995
Josh Durgin35915382011-12-05 18:25:13 -08001996 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001997}
1998
Alex Elder34b13182012-07-13 20:35:12 -05001999static ssize_t rbd_snap_features_show(struct device *dev,
2000 struct device_attribute *attr,
2001 char *buf)
2002{
2003 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2004
2005 return sprintf(buf, "0x%016llx\n",
2006 (unsigned long long) snap->features);
2007}
2008
/* Per-snapshot sysfs attributes, shown under the snap_<name> device */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};
2023
/* Device release callback: frees the snapshot when its last ref drops. */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}
2030
static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

/* Device type for snapshot devices; rbd_snap_registered() keys off it */
static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2040
/*
 * A snapshot's device type is assigned at registration time, so the
 * type check and device_is_registered() must agree; assert that.
 */
static bool rbd_snap_registered(struct rbd_snap *snap)
{
	bool ret = snap->dev.type == &rbd_snap_device_type;
	bool reg = device_is_registered(&snap->dev);

	rbd_assert(!ret ^ reg);

	return ret;
}
2050
/*
 * Unlink a snapshot from the device's snap list and unregister its
 * device if registered.  Unregistering may drop the final reference,
 * freeing the snap via rbd_snap_dev_release().
 */
static void __rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	if (device_is_registered(&snap->dev))
		device_unregister(&snap->dev);
}
2057
/*
 * Register a snapshot's device under the given parent (the rbd
 * device), named "snap_<name>".  Returns device_register()'s result.
 */
static int rbd_register_snap_dev(struct rbd_snap *snap,
				 struct device *parent)
{
	struct device *dev = &snap->dev;
	int ret;

	dev->type = &rbd_snap_device_type;
	dev->parent = parent;
	dev->release = rbd_snap_dev_release;
	dev_set_name(dev, "snap_%s", snap->name);
	dout("%s: registering device for snapshot %s\n", __func__, snap->name);

	ret = device_register(dev);

	return ret;
}
2074
Alex Elder4e891e02012-07-10 20:30:10 -05002075static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05002076 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05002077 u64 snap_id, u64 snap_size,
2078 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002079{
Alex Elder4e891e02012-07-10 20:30:10 -05002080 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002081 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002082
2083 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002084 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002085 return ERR_PTR(-ENOMEM);
2086
2087 ret = -ENOMEM;
Alex Elderc8d18422012-07-10 20:30:11 -05002088 snap->name = kstrdup(snap_name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002089 if (!snap->name)
2090 goto err;
2091
Alex Elderc8d18422012-07-10 20:30:11 -05002092 snap->id = snap_id;
2093 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05002094 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05002095
2096 return snap;
2097
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002098err:
2099 kfree(snap->name);
2100 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002101
2102 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002103}
2104
Alex Eldercd892122012-07-03 16:01:19 -05002105static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2106 u64 *snap_size, u64 *snap_features)
2107{
2108 char *snap_name;
2109
2110 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2111
2112 *snap_size = rbd_dev->header.snap_sizes[which];
2113 *snap_features = 0; /* No features for v1 */
2114
2115 /* Skip over names until we find the one we are looking for */
2116
2117 snap_name = rbd_dev->header.snap_names;
2118 while (which--)
2119 snap_name += strlen(snap_name) + 1;
2120
2121 return snap_name;
2122}
2123
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002124/*
Alex Elder35938152012-08-02 11:29:46 -05002125 * Scan the rbd device's current snapshot list and compare it to the
2126 * newly-received snapshot context. Remove any existing snapshots
2127 * not present in the new snapshot context. Add a new snapshot for
2128 * any snaphots in the snapshot context not in the current list.
2129 * And verify there are no changes to snapshots we already know
2130 * about.
2131 *
2132 * Assumes the snapshots in the snapshot context are sorted by
2133 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2134 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002135 */
Alex Elder304f6802012-08-31 17:29:52 -05002136static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002137{
Alex Elder35938152012-08-02 11:29:46 -05002138 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2139 const u32 snap_count = snapc->num_snaps;
Alex Elder35938152012-08-02 11:29:46 -05002140 struct list_head *head = &rbd_dev->snaps;
2141 struct list_head *links = head->next;
2142 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002143
Alex Elder9fcbb802012-08-23 23:48:49 -05002144 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
Alex Elder35938152012-08-02 11:29:46 -05002145 while (index < snap_count || links != head) {
2146 u64 snap_id;
2147 struct rbd_snap *snap;
Alex Eldercd892122012-07-03 16:01:19 -05002148 char *snap_name;
2149 u64 snap_size = 0;
2150 u64 snap_features = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002151
Alex Elder35938152012-08-02 11:29:46 -05002152 snap_id = index < snap_count ? snapc->snaps[index]
2153 : CEPH_NOSNAP;
2154 snap = links != head ? list_entry(links, struct rbd_snap, node)
2155 : NULL;
Alex Elderaafb230e2012-09-06 16:00:54 -05002156 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002157
Alex Elder35938152012-08-02 11:29:46 -05002158 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2159 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002160
Alex Elder35938152012-08-02 11:29:46 -05002161 /* Existing snapshot not in the new snap context */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002162
Alex Elderf84344f2012-08-31 17:29:51 -05002163 if (rbd_dev->mapping.snap_id == snap->id)
2164 rbd_dev->mapping.snap_exists = false;
Alex Elder35938152012-08-02 11:29:46 -05002165 __rbd_remove_snap_dev(snap);
Alex Elder9fcbb802012-08-23 23:48:49 -05002166 dout("%ssnap id %llu has been removed\n",
Alex Elderf84344f2012-08-31 17:29:51 -05002167 rbd_dev->mapping.snap_id == snap->id ?
2168 "mapped " : "",
Alex Elder9fcbb802012-08-23 23:48:49 -05002169 (unsigned long long) snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002170
Alex Elder35938152012-08-02 11:29:46 -05002171 /* Done with this list entry; advance */
2172
2173 links = next;
2174 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002175 }
Alex Elder35938152012-08-02 11:29:46 -05002176
Alex Eldercd892122012-07-03 16:01:19 -05002177 snap_name = rbd_dev_v1_snap_info(rbd_dev, index,
2178 &snap_size, &snap_features);
2179 if (IS_ERR(snap_name))
2180 return PTR_ERR(snap_name);
2181
Alex Elder9fcbb802012-08-23 23:48:49 -05002182 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2183 (unsigned long long) snap_id);
Alex Elder35938152012-08-02 11:29:46 -05002184 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2185 struct rbd_snap *new_snap;
2186
2187 /* We haven't seen this snapshot before */
2188
Alex Elderc8d18422012-07-10 20:30:11 -05002189 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
Alex Eldercd892122012-07-03 16:01:19 -05002190 snap_id, snap_size, snap_features);
Alex Elder9fcbb802012-08-23 23:48:49 -05002191 if (IS_ERR(new_snap)) {
2192 int err = PTR_ERR(new_snap);
2193
2194 dout(" failed to add dev, error %d\n", err);
2195
2196 return err;
2197 }
Alex Elder35938152012-08-02 11:29:46 -05002198
2199 /* New goes before existing, or at end of list */
2200
Alex Elder9fcbb802012-08-23 23:48:49 -05002201 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05002202 if (snap)
2203 list_add_tail(&new_snap->node, &snap->node);
2204 else
Alex Elder523f3252012-08-30 00:16:37 -05002205 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05002206 } else {
2207 /* Already have this one */
2208
Alex Elder9fcbb802012-08-23 23:48:49 -05002209 dout(" already present\n");
2210
Alex Eldercd892122012-07-03 16:01:19 -05002211 rbd_assert(snap->size == snap_size);
Alex Elderaafb230e2012-09-06 16:00:54 -05002212 rbd_assert(!strcmp(snap->name, snap_name));
Alex Eldercd892122012-07-03 16:01:19 -05002213 rbd_assert(snap->features == snap_features);
Alex Elder35938152012-08-02 11:29:46 -05002214
2215 /* Done with this list entry; advance */
2216
2217 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002218 }
Alex Elder35938152012-08-02 11:29:46 -05002219
2220 /* Advance to the next entry in the snapshot context */
2221
2222 index++;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002223 }
Alex Elder9fcbb802012-08-23 23:48:49 -05002224 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002225
2226 return 0;
2227}
2228
/*
 * Scan the list of snapshots and register the devices for any that
 * have not already been registered.
 */
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
{
	struct rbd_snap *snap;
	int ret = 0;

	dout("%s called\n", __func__);
	/* Snap devices hang off the rbd device, which must exist first */
	if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
		return -EIO;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!rbd_snap_registered(snap)) {
			ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
			if (ret < 0)
				break;
		}
	}
	dout("%s: returning %d\n", __func__, ret);

	return ret;
}
2253
/*
 * Register the rbd device on the rbd bus in sysfs, named by its
 * device id.  Takes ctl_mutex (nested) around the registration.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	struct device *dev;
	int ret;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	dev = &rbd_dev->dev;
	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);

	mutex_unlock(&ctl_mutex);

	return ret;
}
2273
/* Unregister the rbd device from sysfs; release runs via rbd_dev_release. */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2278
/*
 * Set up a watch on the header object.  -ERANGE from the watch
 * request indicates our header version is stale, so refresh the
 * header and retry until the watch succeeds or fails otherwise.
 */
static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
{
	int ret, rc;

	do {
		ret = rbd_req_sync_watch(rbd_dev);
		if (ret == -ERANGE) {
			rc = rbd_refresh_header(rbd_dev, NULL);
			if (rc < 0)
				return rc;
		}
	} while (ret == -ERANGE);

	return ret;
}
2294
/* Highest device id handed out so far; ids start at 1 (see below). */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* atomic_inc_return means the first id handed out is 1 */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002311
Alex Elder1ddbe942012-01-29 13:57:44 -06002312/*
Alex Elder499afd52012-02-02 08:13:29 -06002313 * Remove an rbd_dev from the global list, and record that its
2314 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002315 */
Alex Eldere2839302012-08-29 17:11:06 -05002316static void rbd_dev_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002317{
Alex Elderd184f6b2012-01-29 13:57:44 -06002318 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05002319 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002320 int max_id;
2321
Alex Elderaafb230e2012-09-06 16:00:54 -05002322 rbd_assert(rbd_id > 0);
Alex Elder499afd52012-02-02 08:13:29 -06002323
Alex Eldere2839302012-08-29 17:11:06 -05002324 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2325 (unsigned long long) rbd_dev->dev_id);
Alex Elder499afd52012-02-02 08:13:29 -06002326 spin_lock(&rbd_dev_list_lock);
2327 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002328
2329 /*
2330 * If the id being "put" is not the current maximum, there
2331 * is nothing special we need to do.
2332 */
Alex Eldere2839302012-08-29 17:11:06 -05002333 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
Alex Elderd184f6b2012-01-29 13:57:44 -06002334 spin_unlock(&rbd_dev_list_lock);
2335 return;
2336 }
2337
2338 /*
2339 * We need to update the current maximum id. Search the
2340 * list to find out what it is. We're more likely to find
2341 * the maximum at the end, so search the list backward.
2342 */
2343 max_id = 0;
2344 list_for_each_prev(tmp, &rbd_dev_list) {
2345 struct rbd_device *rbd_dev;
2346
2347 rbd_dev = list_entry(tmp, struct rbd_device, node);
2348 if (rbd_id > max_id)
2349 max_id = rbd_id;
2350 }
Alex Elder499afd52012-02-02 08:13:29 -06002351 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002352
Alex Elder1ddbe942012-01-29 13:57:44 -06002353 /*
Alex Eldere2839302012-08-29 17:11:06 -05002354 * The max id could have been updated by rbd_dev_id_get(), in
Alex Elderd184f6b2012-01-29 13:57:44 -06002355 * which case it now accurately reflects the new maximum.
2356 * Be careful not to overwrite the maximum value in that
2357 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002358 */
Alex Eldere2839302012-08-29 17:11:06 -05002359 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2360 dout(" max dev id has been reset\n");
Alex Elderb7f23c32012-01-29 13:57:43 -06002361}
2362
/*
 * Advance *buf past any leading white space and return the length
 * of the token (maximal run of non-white-space characters) that now
 * begins at *buf.  Returns 0 if nothing but white space remains.
 * The string at *buf must be '\0'-terminated.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * Exactly the characters for which isspace() is nonzero in
	 * the "C" and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* skip to start of token */

	return strcspn(*buf, spaces);	/* length of token at *buf */
}
2381
/*
 * Find the next token in *buf and, if the supplied token buffer is
 * large enough, copy it there as a '\0'-terminated string.  *buf
 * must be '\0'-terminated on entry.
 *
 * Returns the length of the token found (not counting the '\0').
 * A return of 0 means no token was found; a return >= token_size
 * means the token would not fit and was not copied.
 *
 * On return *buf points just past the end of the found token --
 * even when the token buffer was too small to receive a copy.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2411
2412/*
Alex Elderea3352f2012-07-09 21:04:23 -05002413 * Finds the next token in *buf, dynamically allocates a buffer big
2414 * enough to hold a copy of it, and copies the token into the new
2415 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2416 * that a duplicate buffer is created even for a zero-length token.
2417 *
2418 * Returns a pointer to the newly-allocated duplicate, or a null
2419 * pointer if memory for the duplicate was not available. If
2420 * the lenp argument is a non-null pointer, the length of the token
2421 * (not including the '\0') is returned in *lenp.
2422 *
2423 * If successful, the *buf pointer will be updated to point beyond
2424 * the end of the found token.
2425 *
2426 * Note: uses GFP_KERNEL for allocation.
2427 */
2428static inline char *dup_token(const char **buf, size_t *lenp)
2429{
2430 char *dup;
2431 size_t len;
2432
2433 len = next_token(buf);
2434 dup = kmalloc(len + 1, GFP_KERNEL);
2435 if (!dup)
2436 return NULL;
2437
2438 memcpy(dup, *buf, len);
2439 *(dup + len) = '\0';
2440 *buf += len;
2441
2442 if (lenp)
2443 *lenp = len;
2444
2445 return dup;
2446}
2447
/*
 * Parse the argument string written to /sys/bus/rbd/add.  Fills in
 * the pool_name, image_name, and image_name_len fields of the given
 * rbd_dev (each dynamically allocated), records where the monitor
 * address list starts in *mon_addrs (pointing into the caller's buf,
 * not copied), and copies the options token into the caller-supplied
 * options buffer.  Returns a pointer to a dynamically-allocated copy
 * of the snapshot name to map (ownership passes to the caller) if
 * successful, or a pointer-coded error otherwise.
 *
 * Note: rbd_dev is assumed to have been initially zero-filled.
 */
static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
				const char *buf,
				const char **mon_addrs,
				size_t *mon_addrs_size,
				char *options,
				size_t options_size)
{
	size_t len;
	char *err_ptr = ERR_PTR(-EINVAL);
	char *snap_name;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len)
		return err_ptr;
	/* Monitor addresses are not copied; they point into buf */
	*mon_addrs_size = len + 1;
	*mon_addrs = buf;

	buf += len;

	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return err_ptr;

	/* Remaining failures below are allocation failures */
	err_ptr = ERR_PTR(-ENOMEM);
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Snapshot name is optional */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	}
	snap_name = kmalloc(len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_err;
	memcpy(snap_name, buf, len);
	*(snap_name + len) = '\0';

dout(" SNAP_NAME is <%s>, len is %zd\n", snap_name, len);

	return snap_name;

out_err:
	/* Undo partial work; leave rbd_dev as it was found (zeroed) */
	kfree(rbd_dev->image_name);
	rbd_dev->image_name = NULL;
	rbd_dev->image_name_len = 0;
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return err_ptr;
}
2517
/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	/* Id object is named "<RBD_ID_PREFIX><image name>" */
	size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len;
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* Invoke the "get_id" method of the "rbd" object class */
	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	/*
	 * Decode the (length-prefixed) string; on success the
	 * extracted copy is owned by rbd_dev->image_id.
	 */
	p = response;
	rbd_dev->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						&rbd_dev->image_id_len,
						GFP_NOIO);
	if (IS_ERR(rbd_dev->image_id)) {
		ret = PTR_ERR(rbd_dev->image_id);
		rbd_dev->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
2586
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002587static ssize_t rbd_add(struct bus_type *bus,
2588 const char *buf,
2589 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002590{
Alex Eldercb8627c2012-07-09 21:04:23 -05002591 char *options;
2592 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06002593 const char *mon_addrs = NULL;
2594 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06002595 struct ceph_osd_client *osdc;
2596 int rc = -ENOMEM;
Alex Elder3feeb8942012-08-31 17:29:52 -05002597 char *snap_name;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002598
2599 if (!try_module_get(THIS_MODULE))
2600 return -ENODEV;
2601
Alex Elder27cc2592012-02-02 08:13:30 -06002602 options = kmalloc(count, GFP_KERNEL);
2603 if (!options)
Alex Elder85ae8922012-07-26 23:37:14 -05002604 goto err_out_mem;
Alex Eldercb8627c2012-07-09 21:04:23 -05002605 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2606 if (!rbd_dev)
Alex Elder85ae8922012-07-26 23:37:14 -05002607 goto err_out_mem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002608
2609 /* static rbd_device initialization */
2610 spin_lock_init(&rbd_dev->lock);
2611 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002612 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08002613 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002614
Alex Eldera725f65e2012-02-02 08:13:30 -06002615 /* parse add command */
Alex Elder3feeb8942012-08-31 17:29:52 -05002616 snap_name = rbd_add_parse_args(rbd_dev, buf,
2617 &mon_addrs, &mon_addrs_size, options, count);
2618 if (IS_ERR(snap_name)) {
2619 rc = PTR_ERR(snap_name);
Alex Elder85ae8922012-07-26 23:37:14 -05002620 goto err_out_mem;
Alex Elder3feeb8942012-08-31 17:29:52 -05002621 }
Alex Eldera725f65e2012-02-02 08:13:30 -06002622
Alex Elderf8c38922012-08-10 13:12:07 -07002623 rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
2624 if (rc < 0)
Alex Elder85ae8922012-07-26 23:37:14 -05002625 goto err_out_args;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002626
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002627 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06002628 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002629 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2630 if (rc < 0)
2631 goto err_out_client;
Alex Elder9bb2f332012-07-12 10:46:35 -05002632 rbd_dev->pool_id = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002633
Alex Elder589d30e2012-07-10 20:30:11 -05002634 rc = rbd_dev_image_id(rbd_dev);
2635 if (!rc) {
2636 rc = -ENOTSUPP; /* Not actually supporting format 2 yet */
2637 goto err_out_client;
2638 }
2639
2640 /* Version 1 images have no id; empty string is used */
2641
2642 rbd_dev->image_id = kstrdup("", GFP_KERNEL);
2643 if (!rbd_dev->image_id) {
2644 rc = -ENOMEM;
2645 goto err_out_client;
2646 }
2647 rbd_dev->image_id_len = 0;
2648
Alex Elder3fcf2582012-07-03 16:01:19 -05002649 /* Create the name of the header object */
2650
2651 rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
2652 + sizeof (RBD_SUFFIX),
2653 GFP_KERNEL);
2654 if (!rbd_dev->header_name)
2655 goto err_out_client;
2656 sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
2657
Alex Elder05fd6f62012-08-29 17:11:07 -05002658 /* Get information about the image being mapped */
2659
2660 rc = rbd_read_header(rbd_dev, &rbd_dev->header);
2661 if (rc)
2662 goto err_out_client;
2663
2664 /* no need to lock here, as rbd_dev is not registered yet */
2665 rc = rbd_dev_snaps_update(rbd_dev);
2666 if (rc)
2667 goto err_out_header;
2668
2669 rc = rbd_dev_set_mapping(rbd_dev, snap_name);
2670 if (rc)
2671 goto err_out_header;
2672
Alex Elder85ae8922012-07-26 23:37:14 -05002673 /* generate unique id: find highest unique id, add one */
2674 rbd_dev_id_get(rbd_dev);
2675
2676 /* Fill in the device name, now that we have its id. */
2677 BUILD_BUG_ON(DEV_NAME_LEN
2678 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
2679 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
2680
2681 /* Get our block major device number. */
2682
Alex Elder27cc2592012-02-02 08:13:30 -06002683 rc = register_blkdev(0, rbd_dev->name);
2684 if (rc < 0)
Alex Elder85ae8922012-07-26 23:37:14 -05002685 goto err_out_id;
Alex Elder27cc2592012-02-02 08:13:30 -06002686 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002687
Alex Elder0f308a32012-08-29 17:11:07 -05002688 /* Set up the blkdev mapping. */
2689
2690 rc = rbd_init_disk(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002691 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002692 goto err_out_blkdev;
2693
Alex Elder0f308a32012-08-29 17:11:07 -05002694 rc = rbd_bus_add_dev(rbd_dev);
2695 if (rc)
2696 goto err_out_disk;
2697
Alex Elder32eec682012-02-08 16:11:14 -06002698 /*
2699 * At this point cleanup in the event of an error is the job
2700 * of the sysfs code (initiated by rbd_bus_del_dev()).
Alex Elder32eec682012-02-08 16:11:14 -06002701 */
Alex Elder2ac4e752012-07-10 20:30:10 -05002702
Alex Elder4bb1f1e2012-08-23 23:48:49 -05002703 down_write(&rbd_dev->header_rwsem);
Alex Elder5ed16172012-08-29 17:11:07 -05002704 rc = rbd_dev_snaps_register(rbd_dev);
Alex Elder4bb1f1e2012-08-23 23:48:49 -05002705 up_write(&rbd_dev->header_rwsem);
Alex Elder2ac4e752012-07-10 20:30:10 -05002706 if (rc)
2707 goto err_out_bus;
2708
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002709 rc = rbd_init_watch_dev(rbd_dev);
2710 if (rc)
2711 goto err_out_bus;
2712
Alex Elder3ee40012012-08-29 17:11:07 -05002713 /* Everything's ready. Announce the disk to the world. */
2714
2715 add_disk(rbd_dev->disk);
2716
2717 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
2718 (unsigned long long) rbd_dev->mapping.size);
2719
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002720 return count;
2721
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002722err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002723 /* this will also clean up rest of rbd_dev stuff */
2724
2725 rbd_bus_del_dev(rbd_dev);
2726 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002727 return rc;
2728
Alex Elder0f308a32012-08-29 17:11:07 -05002729err_out_disk:
2730 rbd_free_disk(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002731err_out_blkdev:
2732 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder85ae8922012-07-26 23:37:14 -05002733err_out_id:
2734 rbd_dev_id_put(rbd_dev);
Alex Elder05fd6f62012-08-29 17:11:07 -05002735err_out_header:
2736 rbd_header_free(&rbd_dev->header);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002737err_out_client:
Alex Elder3fcf2582012-07-03 16:01:19 -05002738 kfree(rbd_dev->header_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002739 rbd_put_client(rbd_dev);
Alex Elder589d30e2012-07-10 20:30:11 -05002740 kfree(rbd_dev->image_id);
Alex Elder85ae8922012-07-26 23:37:14 -05002741err_out_args:
2742 kfree(rbd_dev->mapping.snap_name);
2743 kfree(rbd_dev->image_name);
2744 kfree(rbd_dev->pool_name);
2745err_out_mem:
Alex Elder27cc2592012-02-02 08:13:30 -06002746 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05002747 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06002748
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002749 dout("Error adding device %s\n", buf);
2750 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06002751
2752 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002753}
2754
Alex Elderde71a292012-07-03 16:01:19 -05002755static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002756{
2757 struct list_head *tmp;
2758 struct rbd_device *rbd_dev;
2759
Alex Eldere124a822012-01-29 13:57:44 -06002760 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002761 list_for_each(tmp, &rbd_dev_list) {
2762 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05002763 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06002764 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002765 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06002766 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002767 }
Alex Eldere124a822012-01-29 13:57:44 -06002768 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002769 return NULL;
2770}
2771
/*
 * Device-core release callback, installed as dev->release by
 * rbd_bus_add_dev().  Runs when the last reference to the sysfs
 * device is dropped (after rbd_bus_del_dev() / device_unregister()).
 * Tears down everything rbd_add() set up, then drops the module
 * reference taken there.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop watching the header object before tearing anything down */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->mapping.snap_name);
	kfree(rbd_dev->image_id);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_dev_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
2806
/*
 * Handle a write to /sys/bus/rbd/remove: parse the target device id
 * and unregister the corresponding device.  The actual teardown and
 * freeing happen in rbd_dev_release() once the device reference
 * count drops.  Returns count on success or a negative errno.
 */
static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	int ret = count;

	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	/* ctl_mutex serializes this against other sysfs operations */
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	__rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);

done:
	mutex_unlock(&ctl_mutex);

	return ret;
}
2841
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002842/*
2843 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002844 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002845 */
2846static int rbd_sysfs_init(void)
2847{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002848 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002849
Alex Elderfed4c142012-02-07 12:03:36 -06002850 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06002851 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002852 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002853
Alex Elderfed4c142012-02-07 12:03:36 -06002854 ret = bus_register(&rbd_bus_type);
2855 if (ret < 0)
2856 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002857
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002858 return ret;
2859}
2860
/* Remove the sysfs control files; exact reverse of rbd_sysfs_init(). */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
2866
2867int __init rbd_init(void)
2868{
2869 int rc;
2870
2871 rc = rbd_sysfs_init();
2872 if (rc)
2873 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002874 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002875 return 0;
2876}
2877
/* Module exit: tear down the sysfs interface set up by rbd_init(). */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2882
2883module_init(rbd_init);
2884module_exit(rbd_exit);
2885
2886MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
2887MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
2888MODULE_DESCRIPTION("rados block device");
2889
2890/* following authorship retained from original osdblk.c */
2891MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
2892
2893MODULE_LICENSE("GPL");