blob: dff621060432fc037d630f0897bd5aaa4d8da116 [file] [log] [blame]
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

                 Documentation/ABI/testing/sysfs-bus-rbd

 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define SECTOR_SHIFT	9
#define SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* It might be useful to have this defined elsewhere too */

#define U64_MAX	((u64) (~0ULL))

/* Driver name as seen by the block layer and in log messages */
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

/* Limits on user-visible name/option string lengths */
#define RBD_MAX_SNAP_NAME_LEN	32
#define RBD_MAX_OPT_LEN		1024

/* Reserved "snapshot" name used when mapping the base (head) image */
#define RBD_SNAP_HEAD_NAME	"-"

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

/* Mappings are writable unless "ro"/read-only is requested in options */
#define RBD_READ_ONLY_DEFAULT	false
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070079
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These four fields never change for a given rbd image */
	char *object_prefix;	/* prefix used to build object (segment) names */
	__u8 obj_order;		/* log2 of the object (segment) size */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* image size in bytes */
	struct ceph_snap_context *snapc;	/* snapshot ids (refcounted) */
	char *snap_names;	/* concatenated NUL-terminated snapshot names */
	u64 *snap_sizes;	/* image size recorded for each snapshot */

	u64 obj_version;	/* header object version (see rbd_refresh_header) */
};
98
/* Per-mapping options parsed from the user-supplied option string */
struct rbd_options {
	bool	read_only;	/* map the device read-only? */
};
102
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;	/* underlying ceph client */
	struct kref		kref;		/* reference count */
	struct list_head	node;		/* entry in rbd_client_list */
};
111
/*
 * a request completion status
 */
struct rbd_req_status {
	int done;	/* nonzero once the request has completed */
	int rc;		/* completion result code */
	u64 bytes;	/* bytes transferred */
};
120
/*
 * a collection of requests
 */
struct rbd_req_coll {
	int			total;		/* number of entries in status[] */
	int			num_done;	/* how many have completed so far */
	struct kref		kref;		/* released via rbd_coll_release() */
	struct rbd_req_status	status[0];	/* trailing variable-length array */
};
130
/*
 * a single io request
 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;			/* length of this request, in bytes */
	int coll_index;			/* our slot in coll->status[] */
	struct rbd_req_coll *coll;	/* collection this request belongs to */
};
142
/* In-memory representation of a single snapshot of an image */
struct rbd_snap {
	struct device dev;		/* sysfs device for this snapshot */
	const char *name;		/* snapshot name */
	u64 size;			/* image size recorded for this snapshot */
	struct list_head node;		/* entry in rbd_device->snaps */
	u64 id;				/* snapshot id */
};
150
/* State describing what (head or snapshot) an rbd device has mapped */
struct rbd_mapping {
	char *snap_name;	/* mapped snapshot name, or RBD_SNAP_HEAD_NAME */
	u64 snap_id;		/* mapped snapshot id; CEPH_NOSNAP for head */
	bool snap_exists;	/* true only while a mapped snapshot exists */
	bool read_only;		/* writes refused when true */
};
157
/*
 * a single device
 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;	/* blkdev's gendisk and rq */

	struct rbd_options rbd_opts;	/* parsed per-mapping options */
	struct rbd_client *rbd_client;	/* (possibly shared) ceph client */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;	/* queue lock */

	struct rbd_image_header header;	/* in-memory image metadata */
	char *image_name;		/* rbd image name */
	size_t image_name_len;		/* cached strlen of image_name */
	char *header_name;		/* name of the image's header object */
	char *pool_name;		/* rados pool holding the image */
	int pool_id;			/* numeric id of that pool */

	struct ceph_osd_event *watch_event;
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore header_rwsem;

	struct rbd_mapping mapping;	/* what is currently mapped */

	struct list_head node;		/* entry in rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
};
197
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);	/* protects rbd_dev_list */

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);	/* protects rbd_client_list */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700205
/* Forward declarations for routines defined later in this file */
static int rbd_dev_snap_devs_update(struct rbd_device *rbd_dev);
static void rbd_dev_release(struct device *dev);
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count);
static void __rbd_remove_snap_dev(struct rbd_snap *snap);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
218
/* sysfs bus attributes: /sys/bus/rbd/{add,remove}, root-writable only */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};
229
/*
 * Root "device" that all rbd devices hang off in sysfs.  It is
 * statically allocated, so its release callback has nothing to free
 * (an empty release also silences the device-core warning).
 */
static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name = "rbd",
	.release = rbd_root_dev_release,
};
238
#ifdef RBD_DEBUG
/*
 * rbd_assert() - report an assertion failure (function, line and the
 * failed expression) and BUG().  Compiles to nothing without RBD_DEBUG.
 */
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800251
/* Take a reference on the rbd device's embedded struct device */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}

/* Drop the reference taken by rbd_get_dev() */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}

static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700263
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700264static int rbd_open(struct block_device *bdev, fmode_t mode)
265{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600266 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700267
Alex Elderf84344f2012-08-31 17:29:51 -0500268 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700269 return -EROFS;
270
Alex Elder340c7a22012-08-10 13:12:07 -0700271 rbd_get_dev(rbd_dev);
Alex Elderf84344f2012-08-31 17:29:51 -0500272 set_device_ro(bdev, rbd_dev->mapping.read_only);
Alex Elder340c7a22012-08-10 13:12:07 -0700273
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700274 return 0;
275}
276
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800277static int rbd_release(struct gendisk *disk, fmode_t mode)
278{
279 struct rbd_device *rbd_dev = disk->private_data;
280
281 rbd_put_dev(rbd_dev);
282
283 return 0;
284}
285
/* Block device operations; rbd devices only implement open/release */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
291
292/*
293 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500294 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700295 */
Alex Elderf8c38922012-08-10 13:12:07 -0700296static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700297{
298 struct rbd_client *rbdc;
299 int ret = -ENOMEM;
300
301 dout("rbd_client_create\n");
302 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
303 if (!rbdc)
304 goto out_opt;
305
306 kref_init(&rbdc->kref);
307 INIT_LIST_HEAD(&rbdc->node);
308
Alex Elderbc534d82012-01-29 13:57:44 -0600309 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
310
Alex Elder43ae4702012-07-03 16:01:18 -0500311 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700312 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600313 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500314 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700315
316 ret = ceph_open_session(rbdc->client);
317 if (ret < 0)
318 goto out_err;
319
Alex Elder432b8582012-01-29 13:57:44 -0600320 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700321 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600322 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700323
Alex Elderbc534d82012-01-29 13:57:44 -0600324 mutex_unlock(&ctl_mutex);
325
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700326 dout("rbd_client_create created %p\n", rbdc);
327 return rbdc;
328
329out_err:
330 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600331out_mutex:
332 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700333 kfree(rbdc);
334out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500335 if (ceph_opts)
336 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400337 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700338}
339
/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	/* A client that asked not to share never matches an existing one */
	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			/* Take the reference while still under the lock */
			kref_get(&client_node->kref);
			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}
364
/*
 * mount options
 *
 * The Opt_last_* entries are range markers used by
 * parse_rbd_opts_token() to classify a token by argument type.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};
378
Alex Elder43ae4702012-07-03 16:01:18 -0500379static match_table_t rbd_opts_tokens = {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700380 /* int args above */
381 /* string args above */
Alex Elderf84344f2012-08-31 17:29:51 -0500382 {Opt_read_only, "mapping.read_only"},
Alex Eldercc0538b2012-08-10 13:12:07 -0700383 {Opt_read_only, "ro"}, /* Alternate spelling */
384 {Opt_read_write, "read_write"},
385 {Opt_read_write, "rw"}, /* Alternate spelling */
386 /* Boolean args above */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700387 {-1, NULL}
388};
389
/*
 * Callback passed to ceph_parse_options(): handle one rbd-specific
 * option token, recording the result in the rbd_options structure
 * that 'private' points to.  Returns 0 on success, negative errno
 * on an unrecognized token or a malformed argument.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* Classify the token using the Opt_last_* markers in the enum */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* Every token in rbd_opts_tokens must be handled above */
		rbd_assert(false);
		break;
	}
	return 0;
}
430
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  On success rbd_dev->rbd_client is set and 0
 * is returned; on failure a negative errno is returned.
 */
static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
			  size_t mon_addr_len, char *options)
{
	struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
	struct ceph_options *ceph_opts;
	struct rbd_client *rbdc;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	/* Parses libceph options; rbd options go via the token callback */
	ceph_opts = ceph_parse_options(options, mon_addr,
					mon_addr + mon_addr_len,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(ceph_opts))
		return PTR_ERR(ceph_opts);

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		/* using an existing client */
		ceph_opts = (ceph_destroy_options(ceph_opts), NULL);
	} else {
		/* rbd_client_create() takes ownership of ceph_opts */
		rbdc = rbd_client_create(ceph_opts);
		if (IS_ERR(rbdc))
			return PTR_ERR(rbdc);
	}
	rbd_dev->rbd_client = rbdc;

	return 0;
}
463
/*
 * Destroy ceph client (kref release callback).
 *
 * Takes rbd_client_list_lock itself to unlink the client from
 * rbd_client_list before tearing it down.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
481
/*
 * Drop reference to ceph client node.  If it's not referenced anymore,
 * release it (via rbd_client_release).
 */
static void rbd_put_client(struct rbd_device *rbd_dev)
{
	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
	rbd_dev->rbd_client = NULL;	/* guard against use after put */
}
491
/*
 * Destroy requests collection (kref release callback for
 * struct rbd_req_coll::kref).
 */
static void rbd_coll_release(struct kref *kref)
{
	struct rbd_req_coll *coll =
		container_of(kref, struct rbd_req_coll, kref);

	dout("rbd_coll_release %p\n", coll);
	kfree(coll);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700503
Alex Elder8e94af82012-07-25 09:32:40 -0500504static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
505{
Alex Elder103a1502012-08-02 11:29:45 -0500506 size_t size;
507 u32 snap_count;
508
509 /* The header has to start with the magic rbd header text */
510 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
511 return false;
512
513 /*
514 * The size of a snapshot header has to fit in a size_t, and
515 * that limits the number of snapshots.
516 */
517 snap_count = le32_to_cpu(ondisk->snap_count);
518 size = SIZE_MAX - sizeof (struct ceph_snap_context);
519 if (snap_count > size / sizeof (__le64))
520 return false;
521
522 /*
523 * Not only that, but the size of the entire the snapshot
524 * header must also be representable in a size_t.
525 */
526 size -= snap_count * sizeof (__le64);
527 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
528 return false;
529
530 return true;
Alex Elder8e94af82012-07-25 09:32:40 -0500531}
532
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700533/*
534 * Create a new header structure, translate header format from the on-disk
535 * header.
536 */
537static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500538 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700539{
Alex Elderccece232012-07-10 20:30:10 -0500540 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500541 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500542 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500543 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700544
Alex Elder6a523252012-07-19 17:12:59 -0500545 memset(header, 0, sizeof (*header));
546
Alex Elder103a1502012-08-02 11:29:45 -0500547 snap_count = le32_to_cpu(ondisk->snap_count);
548
Alex Elder58c17b02012-08-23 23:22:06 -0500549 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
550 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500551 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700552 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500553 memcpy(header->object_prefix, ondisk->object_prefix, len);
554 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600555
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700556 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500557 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
558
Alex Elder621901d2012-08-23 23:22:06 -0500559 /* Save a copy of the snapshot names */
560
Alex Elderf785cc12012-08-23 23:22:06 -0500561 if (snap_names_len > (u64) SIZE_MAX)
562 return -EIO;
563 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700564 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500565 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500566 /*
567 * Note that rbd_dev_v1_header_read() guarantees
568 * the ondisk buffer we're working with has
569 * snap_names_len bytes beyond the end of the
570 * snapshot id array, this memcpy() is safe.
571 */
572 memcpy(header->snap_names, &ondisk->snaps[snap_count],
573 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500574
Alex Elder621901d2012-08-23 23:22:06 -0500575 /* Record each snapshot's size */
576
Alex Elderd2bb24e2012-07-26 23:37:14 -0500577 size = snap_count * sizeof (*header->snap_sizes);
578 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700579 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500580 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500581 for (i = 0; i < snap_count; i++)
582 header->snap_sizes[i] =
583 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700584 } else {
Alex Elderccece232012-07-10 20:30:10 -0500585 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700586 header->snap_names = NULL;
587 header->snap_sizes = NULL;
588 }
Alex Elder849b4262012-07-09 21:04:24 -0500589
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700590 header->obj_order = ondisk->options.order;
591 header->crypt_type = ondisk->options.crypt_type;
592 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500593
Alex Elder621901d2012-08-23 23:22:06 -0500594 /* Allocate and fill in the snapshot context */
595
Alex Elderf84344f2012-08-31 17:29:51 -0500596 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500597 size = sizeof (struct ceph_snap_context);
598 size += snap_count * sizeof (header->snapc->snaps[0]);
599 header->snapc = kzalloc(size, GFP_KERNEL);
600 if (!header->snapc)
601 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700602
603 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500604 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700605 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500606 for (i = 0; i < snap_count; i++)
607 header->snapc->snaps[i] =
608 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700609
610 return 0;
611
Alex Elder6a523252012-07-19 17:12:59 -0500612out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500613 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500614 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700615 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500616 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500617 kfree(header->object_prefix);
618 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500619
Alex Elder00f1f362012-02-07 12:03:36 -0600620 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700621}
622
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700623static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
624 u64 *seq, u64 *size)
625{
626 int i;
627 char *p = header->snap_names;
628
Alex Elderc9aadfe2012-08-30 14:42:15 -0500629 rbd_assert(header->snapc != NULL);
630 for (i = 0; i < header->snapc->num_snaps; i++) {
Alex Elder00f1f362012-02-07 12:03:36 -0600631 if (!strcmp(snap_name, p)) {
632
633 /* Found it. Pass back its id and/or size */
634
635 if (seq)
636 *seq = header->snapc->snaps[i];
637 if (size)
638 *size = header->snap_sizes[i];
639 return i;
640 }
641 p += strlen(p) + 1; /* Skip ahead to the next name */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700642 }
Alex Elder00f1f362012-02-07 12:03:36 -0600643 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700644}
645
/*
 * Initialize rbd_dev->mapping from the already-recorded snapshot name.
 * Mapping the head ("-") leaves the device writable subject to the
 * user's read-only option; mapping a named snapshot forces read-only.
 * If 'size' is non-NULL the mapped image size is stored through it.
 * Returns 0 on success or the (negative) snap_by_name() error.
 */
static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
{
	int ret;

	/* Serialize against header refresh */
	down_write(&rbd_dev->header_rwsem);

	if (!memcmp(rbd_dev->mapping.snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		rbd_dev->mapping.snap_id = CEPH_NOSNAP;
		rbd_dev->mapping.snap_exists = false;
		rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only;
		if (size)
			*size = rbd_dev->header.image_size;
	} else {
		u64 snap_id = 0;

		ret = snap_by_name(&rbd_dev->header,
				   rbd_dev->mapping.snap_name,
				   &snap_id, size);
		if (ret < 0)
			goto done;
		/* A mapped snapshot is always read-only */
		rbd_dev->mapping.snap_id = snap_id;
		rbd_dev->mapping.snap_exists = true;
		rbd_dev->mapping.read_only = true;
	}

	ret = 0;
done:
	up_write(&rbd_dev->header_rwsem);
	return ret;
}
677
/*
 * Release all storage referenced by an in-memory image header.
 * Pointers are cleared afterwards so a repeated call is harmless.
 */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	/* Drops our reference; the context is refcounted, not kfree'd */
	ceph_put_snap_context(header->snapc);
	header->snapc = NULL;
}
689
Alex Elder65ccfe22012-08-09 10:33:26 -0700690static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700691{
Alex Elder65ccfe22012-08-09 10:33:26 -0700692 char *name;
693 u64 segment;
694 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700695
Alex Elder65ccfe22012-08-09 10:33:26 -0700696 name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
697 if (!name)
698 return NULL;
699 segment = offset >> rbd_dev->header.obj_order;
700 ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
701 rbd_dev->header.object_prefix, segment);
702 if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
703 pr_err("error formatting segment name for #%llu (%d)\n",
704 segment, ret);
705 kfree(name);
706 name = NULL;
707 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700708
Alex Elder65ccfe22012-08-09 10:33:26 -0700709 return name;
710}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700711
Alex Elder65ccfe22012-08-09 10:33:26 -0700712static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
713{
714 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700715
Alex Elder65ccfe22012-08-09 10:33:26 -0700716 return offset & (segment_size - 1);
717}
718
719static u64 rbd_segment_length(struct rbd_device *rbd_dev,
720 u64 offset, u64 length)
721{
722 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
723
724 offset &= segment_size - 1;
725
Alex Elderaafb2302012-09-06 16:00:54 -0500726 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700727 if (offset + length > segment_size)
728 length = segment_size - offset;
729
730 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700731}
732
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700733static int rbd_get_num_segments(struct rbd_image_header *header,
734 u64 ofs, u64 len)
735{
Alex Elderdf111be2012-08-09 10:33:26 -0700736 u64 start_seg;
737 u64 end_seg;
738
739 if (!len)
740 return 0;
741 if (len - 1 > U64_MAX - ofs)
742 return -ERANGE;
743
744 start_seg = ofs >> header->obj_order;
745 end_seg = (ofs + len - 1) >> header->obj_order;
746
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700747 return end_seg - start_seg + 1;
748}
749
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700750/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700751 * returns the size of an object in the image
752 */
753static u64 rbd_obj_bytes(struct rbd_image_header *header)
754{
755 return 1 << header->obj_order;
756}
757
758/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700759 * bio helpers
760 */
761
762static void bio_chain_put(struct bio *chain)
763{
764 struct bio *tmp;
765
766 while (chain) {
767 tmp = chain;
768 chain = chain->bi_next;
769 bio_put(tmp);
770 }
771}
772
/*
 * zeros a bio chain, starting at specific offset
 *
 * Bytes at chain offsets >= start_ofs are cleared; any segment that
 * straddles start_ofs is cleared only from start_ofs onward.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* running byte offset within the whole chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* start at start_ofs if it lands inside this segment */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
799
800/*
801 * bio_chain_clone - clone a chain of bios up to a certain length.
802 * might return a bio_pair that will need to be released.
803 */
804static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
805 struct bio_pair **bp,
806 int len, gfp_t gfpmask)
807{
Alex Elder542582f2012-08-09 10:33:25 -0700808 struct bio *old_chain = *old;
809 struct bio *new_chain = NULL;
810 struct bio *tail;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700811 int total = 0;
812
813 if (*bp) {
814 bio_pair_release(*bp);
815 *bp = NULL;
816 }
817
818 while (old_chain && (total < len)) {
Alex Elder542582f2012-08-09 10:33:25 -0700819 struct bio *tmp;
820
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700821 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
822 if (!tmp)
823 goto err_out;
Alex Elder542582f2012-08-09 10:33:25 -0700824 gfpmask &= ~__GFP_WAIT; /* can't wait after the first */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700825
826 if (total + old_chain->bi_size > len) {
827 struct bio_pair *bp;
828
829 /*
830 * this split can only happen with a single paged bio,
831 * split_bio will BUG_ON if this is not the case
832 */
833 dout("bio_chain_clone split! total=%d remaining=%d"
Alex Elderbd919d42012-07-13 20:35:11 -0500834 "bi_size=%u\n",
835 total, len - total, old_chain->bi_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700836
837 /* split the bio. We'll release it either in the next
838 call, or it will have to be released outside */
Alex Elder593a9e72012-02-07 12:03:37 -0600839 bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700840 if (!bp)
841 goto err_out;
842
843 __bio_clone(tmp, &bp->bio1);
844
845 *next = &bp->bio2;
846 } else {
847 __bio_clone(tmp, old_chain);
848 *next = old_chain->bi_next;
849 }
850
851 tmp->bi_bdev = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700852 tmp->bi_next = NULL;
Alex Elder542582f2012-08-09 10:33:25 -0700853 if (new_chain)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700854 tail->bi_next = tmp;
Alex Elder542582f2012-08-09 10:33:25 -0700855 else
856 new_chain = tmp;
857 tail = tmp;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700858 old_chain = old_chain->bi_next;
859
860 total += tmp->bi_size;
861 }
862
Alex Elderaafb2302012-09-06 16:00:54 -0500863 rbd_assert(total == len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700864
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700865 *old = old_chain;
866
867 return new_chain;
868
869err_out:
870 dout("bio_chain_clone with err\n");
871 bio_chain_put(new_chain);
872 return NULL;
873}
874
875/*
876 * helpers for osd request op vectors.
877 */
Alex Elder57cfc102012-06-26 12:57:03 -0700878static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
879 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700880{
Alex Elder57cfc102012-06-26 12:57:03 -0700881 struct ceph_osd_req_op *ops;
882
883 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
884 if (!ops)
885 return NULL;
886
887 ops[0].op = opcode;
888
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700889 /*
890 * op extent offset and length will be set later on
891 * in calc_raw_layout()
892 */
Alex Elder57cfc102012-06-26 12:57:03 -0700893 ops[0].payload_len = payload_len;
894
895 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700896}
897
/* Free an op vector allocated by rbd_create_rw_ops(); NULL is a no-op. */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
902
/*
 * Record completion of sub-request @index of a collection and complete
 * as much of the original block request as is now contiguously done.
 *
 * Sub-requests can finish out of order; the block layer must be told
 * about completed bytes in order, so each completion only advances
 * num_done over the leading run of finished entries.  Each consumed
 * entry drops one collection reference (taken when it was issued).
 * With no collection, the whole request is completed at once.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	if (!coll) {
		/* not a multi-segment request: finish it in one call */
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	/* queue_lock protects the collection state and __blk_end_request */
	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	max = min = coll->num_done;
	/* extend over the contiguous run of finished sub-requests */
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i<max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
940
941static void rbd_coll_end_req(struct rbd_request *req,
942 int ret, u64 len)
943{
944 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
945}
946
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700947/*
948 * Send ceph osd request
949 */
950static int rbd_do_request(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -0500951 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700952 struct ceph_snap_context *snapc,
953 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -0500954 const char *object_name, u64 ofs, u64 len,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700955 struct bio *bio,
956 struct page **pages,
957 int num_pages,
958 int flags,
959 struct ceph_osd_req_op *ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700960 struct rbd_req_coll *coll,
961 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700962 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700963 struct ceph_msg *msg),
964 struct ceph_osd_request **linger_req,
965 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700966{
967 struct ceph_osd_request *req;
968 struct ceph_file_layout *layout;
969 int ret;
970 u64 bno;
971 struct timespec mtime = CURRENT_TIME;
972 struct rbd_request *req_data;
973 struct ceph_osd_request_head *reqhead;
Alex Elder1dbb4392012-01-24 10:08:37 -0600974 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700975
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700976 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700977 if (!req_data) {
978 if (coll)
979 rbd_coll_end_req_index(rq, coll, coll_index,
980 -ENOMEM, len);
981 return -ENOMEM;
982 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700983
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700984 if (coll) {
985 req_data->coll = coll;
986 req_data->coll_index = coll_index;
987 }
988
Alex Elderbd919d42012-07-13 20:35:11 -0500989 dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
990 (unsigned long long) ofs, (unsigned long long) len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700991
Alex Elder0ce1a792012-07-03 16:01:18 -0500992 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder1dbb4392012-01-24 10:08:37 -0600993 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
994 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -0700995 if (!req) {
Sage Weil4ad12622011-05-03 09:23:36 -0700996 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700997 goto done_pages;
998 }
999
1000 req->r_callback = rbd_cb;
1001
1002 req_data->rq = rq;
1003 req_data->bio = bio;
1004 req_data->pages = pages;
1005 req_data->len = len;
1006
1007 req->r_priv = req_data;
1008
1009 reqhead = req->r_request->front.iov_base;
1010 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
1011
Alex Elderaded07e2012-07-03 16:01:18 -05001012 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001013 req->r_oid_len = strlen(req->r_oid);
1014
1015 layout = &req->r_file_layout;
1016 memset(layout, 0, sizeof(*layout));
1017 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1018 layout->fl_stripe_count = cpu_to_le32(1);
1019 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
Alex Elder0ce1a792012-07-03 16:01:18 -05001020 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
Alex Elder1dbb4392012-01-24 10:08:37 -06001021 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
1022 req, ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001023
1024 ceph_osdc_build_request(req, ofs, &len,
1025 ops,
1026 snapc,
1027 &mtime,
1028 req->r_oid, req->r_oid_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001029
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001030 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -06001031 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001032 *linger_req = req;
1033 }
1034
Alex Elder1dbb4392012-01-24 10:08:37 -06001035 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001036 if (ret < 0)
1037 goto done_err;
1038
1039 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -06001040 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001041 if (ver)
1042 *ver = le64_to_cpu(req->r_reassert_version.version);
Alex Elderbd919d42012-07-13 20:35:11 -05001043 dout("reassert_ver=%llu\n",
1044 (unsigned long long)
1045 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001046 ceph_osdc_put_request(req);
1047 }
1048 return ret;
1049
1050done_err:
1051 bio_chain_put(req_data->bio);
1052 ceph_osdc_put_request(req);
1053done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001054 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001055 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001056 return ret;
1057}
1058
/*
 * Ceph osd op callback
 *
 * Completion handler for async requests issued by rbd_do_op().  Parses
 * the reply header from the message front, normalizes short/missing
 * reads by zero-filling the bio chain, completes the collection slot,
 * and releases the request and its private data.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);	/* first op follows the header */
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
		(unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* reading a hole (nonexistent object): all zeros, success */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero the tail and report the full length */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1098
/*
 * Minimal completion callback: just drop the request reference.
 * Used for requests whose result is not examined (e.g. notify acks).
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1103
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a temporary page vector to back the transfer, issues the
 * request via rbd_do_request() with no callback (so it waits for the
 * reply), and for reads copies the returned data into @buf.  The page
 * vector is always released before returning.
 *
 * NOTE(review): there is no copy *into* the pages for write-flagged
 * ops; callers here appear to pass buf only for reads — confirm.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int flags,
			   struct ceph_osd_req_op *ops,
			   const char *object_name,
			   u64 ofs, u64 len,
			   char *buf,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(ops != NULL);

	num_pages = calc_pages_for(ofs , len);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* rbd_cb == NULL makes rbd_do_request() synchronous */
	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			  object_name, ofs, len, NULL,
			  pages, num_pages,
			  flags,
			  ops,
			  NULL, 0,
			  NULL,
			  linger_req, ver);
	if (ret < 0)
		goto done;

	/* on success ret is the number of bytes read */
	if ((flags & CEPH_OSD_FLAG_READ) && buf)
		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1147
/*
 * Do an asynchronous ceph osd operation
 *
 * Maps an image-relative extent to its backing object (segment) and
 * issues a single async read or write for it, with completion reported
 * through the collection slot (coll, coll_index) via rbd_req_cb().
 * The caller has already sized the extent so it never crosses a
 * segment boundary (see the assertion below).
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 snapid,
		     int opcode, int flags,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;

	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	/* only writes carry a data payload */
	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1200
1201/*
1202 * Request async osd write
1203 */
1204static int rbd_req_write(struct request *rq,
1205 struct rbd_device *rbd_dev,
1206 struct ceph_snap_context *snapc,
1207 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001208 struct bio *bio,
1209 struct rbd_req_coll *coll,
1210 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001211{
1212 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1213 CEPH_OSD_OP_WRITE,
1214 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001215 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001216}
1217
1218/*
1219 * Request async osd read
1220 */
1221static int rbd_req_read(struct request *rq,
1222 struct rbd_device *rbd_dev,
1223 u64 snapid,
1224 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001225 struct bio *bio,
1226 struct rbd_req_coll *coll,
1227 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001228{
1229 return rbd_do_op(rq, rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001230 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001231 CEPH_OSD_OP_READ,
1232 CEPH_OSD_FLAG_READ,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001233 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001234}
1235
1236/*
1237 * Request sync osd read
1238 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001239static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001240 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001241 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001242 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001243 char *buf,
1244 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001245{
Alex Elder913d2fd2012-06-26 12:57:03 -07001246 struct ceph_osd_req_op *ops;
1247 int ret;
1248
1249 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1250 if (!ops)
1251 return -ENOMEM;
1252
1253 ret = rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001254 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001255 CEPH_OSD_FLAG_READ,
Alex Elder913d2fd2012-06-26 12:57:03 -07001256 ops, object_name, ofs, len, buf, NULL, ver);
1257 rbd_destroy_ops(ops);
1258
1259 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001260}
1261
1262/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001263 * Request sync osd watch
1264 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001265static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001266 u64 ver,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001267 u64 notify_id)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001268{
1269 struct ceph_osd_req_op *ops;
Sage Weil11f77002011-05-12 16:13:54 -07001270 int ret;
1271
Alex Elder57cfc102012-06-26 12:57:03 -07001272 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1273 if (!ops)
1274 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001275
Josh Durgina71b8912011-12-05 18:10:44 -08001276 ops[0].watch.ver = cpu_to_le64(ver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001277 ops[0].watch.cookie = notify_id;
1278 ops[0].watch.flag = 0;
1279
Alex Elder0ce1a792012-07-03 16:01:18 -05001280 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001281 rbd_dev->header_name, 0, 0, NULL,
Alex Elderad4f2322012-07-03 16:01:19 -05001282 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001283 CEPH_OSD_FLAG_READ,
1284 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001285 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001286 rbd_simple_req_cb, 0, NULL);
1287
1288 rbd_destroy_ops(ops);
1289 return ret;
1290}
1291
/*
 * Watch event callback for the header object: refresh the cached
 * header and acknowledge the notification.
 *
 * NOTE(review): if rbd_refresh_header() fails, hver is left
 * uninitialized yet is still passed to rbd_req_sync_notify_ack() —
 * looks like a latent bug; confirm rbd_refresh_header()'s contract.
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	u64 hver;
	int rc;

	if (!rbd_dev)
		return;

	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
		rbd_dev->header_name, (unsigned long long) notify_id,
		(unsigned int) opcode);
	rc = rbd_refresh_header(rbd_dev, &hver);
	if (rc)
		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
			   " update snaps: %d\n", rbd_dev->major, rc);

	/* ack so the notifier is not left waiting for us */
	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
}
1311
/*
 * Request sync osd watch
 *
 * Registers a watch on the header object so header changes (snapshots,
 * resizes) trigger rbd_watch_cb().  Creates the osd event first, then
 * issues a lingering WATCH op carrying the event cookie; the lingering
 * request handle is stored in rbd_dev->watch_request.  On failure the
 * event (if created) is canceled and the op vector freed.
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 == establish the watch */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1355
/*
 * Request sync osd unwatch
 *
 * Tears down the header-object watch set up by rbd_req_sync_watch():
 * sends a WATCH op with flag == 0 (unwatch) for our cookie, then
 * cancels the osd event unconditionally, even if the op failed.
 */
static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 0;	/* 0 == remove the watch */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL, NULL, NULL);


	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	return ret;
}
1385
/* Context handed to rbd_notify_cb() through the osd event data pointer. */
struct rbd_notify_info {
	struct rbd_device *rbd_dev;
};
1389
/*
 * Event callback for a self-issued notify (rbd_req_sync_notify()).
 * Only logs the notification; no state is changed here.
 */
static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	if (!rbd_dev)
		return;

	dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n",
			rbd_dev->header_name, (unsigned long long) notify_id,
			(unsigned int) opcode);
}
1400
/*
 * Request sync osd notify
 *
 * Sends a NOTIFY op on the header object and waits (bounded by
 * CEPH_OSD_TIMEOUT_DEFAULT) for the notification round-trip.
 *
 * NOTE(review): the result of ceph_osdc_wait_event() is logged but
 * discarded — the function returns 0 even if the wait timed out.
 * NOTE(review): on the success path the event does not appear to be
 * canceled; verify ceph_osdc_wait_event()'s ownership semantics.
 */
static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_event *event;
	struct rbd_notify_info info;
	/* payload: two u32s (version fields of the notify message) —
	 * TODO confirm against the osd notify wire format */
	int payload_len = sizeof(u32) + sizeof(u32);
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len);
	if (!ops)
		return -ENOMEM;

	info.rbd_dev = rbd_dev;

	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
				     (void *)&info, &event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = 1;
	ops[0].watch.flag = 1;
	ops[0].watch.cookie = event->cookie;
	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
	ops[0].watch.timeout = 12;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			       CEPH_NOSNAP,
			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			       ops,
			       rbd_dev->header_name,
			       0, 0, NULL, NULL, NULL);
	if (ret < 0)
		goto fail_event;

	ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
	dout("ceph_osdc_wait_event returned %d\n", ret);
	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(event);
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1450
1451/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001452 * Request sync osd read
1453 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001454static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001455 const char *object_name,
1456 const char *class_name,
1457 const char *method_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001458 const char *data,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001459 int len,
1460 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001461{
1462 struct ceph_osd_req_op *ops;
Alex Elderaded07e2012-07-03 16:01:18 -05001463 int class_name_len = strlen(class_name);
1464 int method_name_len = strlen(method_name);
Alex Elder57cfc102012-06-26 12:57:03 -07001465 int ret;
1466
1467 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL,
Alex Elderaded07e2012-07-03 16:01:18 -05001468 class_name_len + method_name_len + len);
Alex Elder57cfc102012-06-26 12:57:03 -07001469 if (!ops)
1470 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001471
Alex Elderaded07e2012-07-03 16:01:18 -05001472 ops[0].cls.class_name = class_name;
1473 ops[0].cls.class_len = (__u8) class_name_len;
1474 ops[0].cls.method_name = method_name;
1475 ops[0].cls.method_len = (__u8) method_name_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001476 ops[0].cls.argc = 0;
1477 ops[0].cls.indata = data;
1478 ops[0].cls.indata_len = len;
1479
Alex Elder0ce1a792012-07-03 16:01:18 -05001480 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001481 CEPH_NOSNAP,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001482 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1483 ops,
Alex Elderd1f57ea2012-06-26 12:57:03 -07001484 object_name, 0, 0, NULL, NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001485
1486 rbd_destroy_ops(ops);
1487
1488 dout("cls_exec returned %d\n", ret);
1489 return ret;
1490}
1491
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001492static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1493{
1494 struct rbd_req_coll *coll =
1495 kzalloc(sizeof(struct rbd_req_coll) +
1496 sizeof(struct rbd_req_status) * num_reqs,
1497 GFP_ATOMIC);
1498
1499 if (!coll)
1500 return NULL;
1501 coll->total = num_reqs;
1502 kref_init(&coll->kref);
1503 return coll;
1504}
1505
/*
 * block device queue callback
 *
 * Pulls requests off the queue, validates them, and splits each into
 * per-segment async osd reads/writes tracked by a collection.  The
 * queue_lock is held on entry to each loop iteration (blk_fetch_request
 * requires it); it is dropped around the blocking/allocating work and
 * re-taken before the next fetch.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;
	struct bio_pair *bp = NULL;

	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		struct bio *rq_bio, *next_bio = NULL;
		bool do_write;
		unsigned int size;
		u64 op_size = 0;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		rq_bio = rq->bio;
		if (do_write && rbd_dev->mapping.read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		/* drop the queue lock for the blocking work below */
		spin_unlock_irq(q->queue_lock);

		down_read(&rbd_dev->header_rwsem);

		/* mapped snapshot may have been deleted underneath us */
		if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
		    !rbd_dev->mapping.snap_exists) {
			up_read(&rbd_dev->header_rwsem);
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		/* pin the snap context for the lifetime of this request */
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		if (num_segs <= 0) {
			/* 0 (empty) or -ERANGE: end the request with it */
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, num_segs);
			ceph_put_snap_context(snapc);
			continue;
		}
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		/* issue one osd op per segment the request spans */
		do {
			/* a bio clone to be passed down to OSD req */
			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
			op_size = rbd_segment_length(rbd_dev, ofs, size);
			kref_get(&coll->kref);	/* ref held by this sub-request */
			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
					      op_size, GFP_ATOMIC);
			if (!bio) {
				/* fail this slot but keep going on the rest */
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, op_size);
				goto next_seg;
			}


			/* init OSD command: write or read */
			if (do_write)
				rbd_req_write(rq, rbd_dev,
					      snapc,
					      ofs,
					      op_size, bio,
					      coll, cur_seg);
			else
				rbd_req_read(rq, rbd_dev,
					     rbd_dev->mapping.snap_id,
					     ofs,
					     op_size, bio,
					     coll, cur_seg);

next_seg:
			size -= op_size;
			ofs += op_size;

			cur_seg++;
			rq_bio = next_bio;
		} while (size > 0);
		/* drop the allocation reference; sub-requests hold theirs */
		kref_put(&coll->kref, rbd_coll_release);

		if (bp)
			bio_pair_release(bp);
		/* re-take the lock for the next blk_fetch_request() */
		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1625
1626/*
1627 * a queue callback. Makes sure that we don't create a bio that spans across
1628 * multiple osd objects. One exception would be with a single page bios,
1629 * which we handle later at bio_chain_clone
1630 */
1631static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1632 struct bio_vec *bvec)
1633{
1634 struct rbd_device *rbd_dev = q->queuedata;
Alex Elder593a9e72012-02-07 12:03:37 -06001635 unsigned int chunk_sectors;
1636 sector_t sector;
1637 unsigned int bio_sectors;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001638 int max;
1639
Alex Elder593a9e72012-02-07 12:03:37 -06001640 chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1641 sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1642 bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1643
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001644 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
Alex Elder593a9e72012-02-07 12:03:37 -06001645 + bio_sectors)) << SECTOR_SHIFT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001646 if (max < 0)
1647 max = 0; /* bio_add cannot handle a negative return */
1648 if (max <= bvec->bv_len && bio_sectors == 0)
1649 return bvec->bv_len;
1650 return max;
1651}
1652
1653static void rbd_free_disk(struct rbd_device *rbd_dev)
1654{
1655 struct gendisk *disk = rbd_dev->disk;
1656
1657 if (!disk)
1658 return;
1659
1660 rbd_header_free(&rbd_dev->header);
1661
1662 if (disk->flags & GENHD_FL_UP)
1663 del_gendisk(disk);
1664 if (disk->queue)
1665 blk_cleanup_queue(disk->queue);
1666 put_disk(disk);
1667}
1668
1669/*
Alex Elder4156d992012-08-02 11:29:46 -05001670 * Read the complete header for the given rbd device.
1671 *
1672 * Returns a pointer to a dynamically-allocated buffer containing
1673 * the complete and validated header. Caller can pass the address
1674 * of a variable that will be filled in with the version of the
1675 * header object at the time it was read.
1676 *
1677 * Returns a pointer-coded errno if a failure occurs.
1678 */
static struct rbd_image_header_ondisk *
rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;	/* snapshot count seen on the previous pass */
	u64 names_size = 0;	/* snapshot-name bytes seen on previous pass */
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		kfree(ondisk);	/* drop the previous (stale-sized) buffer */

		/* Fixed header + snapshot records + name strings */
		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return ERR_PTR(-ENOMEM);

		ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
				       rbd_dev->header_name,
				       0, size,
				       (char *) ondisk, version);

		if (ret < 0)
			goto out_err;
		/* A short read means we did not get the whole header */
		if (WARN_ON((size_t) ret < size)) {
			ret = -ENXIO;
			pr_warning("short header read for image %s"
					" (want %zd got %d)\n",
				rbd_dev->image_name, size, ret);
			goto out_err;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			pr_warning("invalid header for image %s\n",
				rbd_dev->image_name);
			goto out_err;
		}

		/* Re-read if the snapshot count changed under us */
		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	return ondisk;

out_err:
	kfree(ondisk);

	return ERR_PTR(ret);
}
1740
1741/*
1742 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001743 */
1744static int rbd_read_header(struct rbd_device *rbd_dev,
1745 struct rbd_image_header *header)
1746{
Alex Elder4156d992012-08-02 11:29:46 -05001747 struct rbd_image_header_ondisk *ondisk;
1748 u64 ver = 0;
1749 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001750
Alex Elder4156d992012-08-02 11:29:46 -05001751 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1752 if (IS_ERR(ondisk))
1753 return PTR_ERR(ondisk);
1754 ret = rbd_header_from_disk(header, ondisk);
1755 if (ret >= 0)
1756 header->obj_version = ver;
1757 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001758
Alex Elder4156d992012-08-02 11:29:46 -05001759 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001760}
1761
1762/*
1763 * create a snapshot
1764 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001765static int rbd_header_add_snap(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001766 const char *snap_name,
1767 gfp_t gfp_flags)
1768{
1769 int name_len = strlen(snap_name);
1770 u64 new_snapid;
1771 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001772 void *data, *p, *e;
Alex Elder1dbb4392012-01-24 10:08:37 -06001773 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001774
1775 /* we should create a snapshot only if we're pointing at the head */
Alex Elderf84344f2012-08-31 17:29:51 -05001776 if (rbd_dev->mapping.snap_id != CEPH_NOSNAP)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001777 return -EINVAL;
1778
Alex Elder0ce1a792012-07-03 16:01:18 -05001779 monc = &rbd_dev->rbd_client->client->monc;
1780 ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
Alex Elderbd919d42012-07-13 20:35:11 -05001781 dout("created snapid=%llu\n", (unsigned long long) new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001782 if (ret < 0)
1783 return ret;
1784
1785 data = kmalloc(name_len + 16, gfp_flags);
1786 if (!data)
1787 return -ENOMEM;
1788
Sage Weil916d4d62011-05-12 16:10:50 -07001789 p = data;
1790 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001791
Sage Weil916d4d62011-05-12 16:10:50 -07001792 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1793 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001794
Alex Elder0bed54d2012-07-03 16:01:18 -05001795 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
Alex Elder0ce1a792012-07-03 16:01:18 -05001796 "rbd", "snap_add",
Alex Elderd67d4be2012-07-13 20:35:11 -05001797 data, p - data, NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001798
Sage Weil916d4d62011-05-12 16:10:50 -07001799 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001800
Alex Elder505cbb92012-07-19 08:49:18 -05001801 return ret < 0 ? ret : 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001802bad:
1803 return -ERANGE;
1804}
1805
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001806static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1807{
1808 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001809 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001810
Alex Eldera0593292012-07-19 09:09:27 -05001811 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder14e70852012-07-19 09:09:27 -05001812 __rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001813}
1814
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001815/*
1816 * only read the first part of the ondisk header, without the snaps info
1817 */
static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	struct rbd_image_header h;	/* freshly read header (on stack) */

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	/* Writer lock: we are about to swap header fields in place */
	down_write(&rbd_dev->header_rwsem);

	/* resized? */
	if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) {
		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;

		dout("setting size to %llu sectors", (unsigned long long) size);
		set_capacity(rbd_dev->disk, size);
	}

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	if (hver)
		*hver = h.obj_version;
	rbd_dev->header.obj_version = h.obj_version;
	rbd_dev->header.image_size = h.image_size;
	/* Take ownership of the new snapc/name/size allocations */
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	/* Reconcile the snapshot device list with the new context */
	ret = rbd_dev_snap_devs_update(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
1860
Alex Elder1fe5e992012-07-25 09:32:41 -05001861static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
1862{
1863 int ret;
1864
1865 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1866 ret = __rbd_refresh_header(rbd_dev, hver);
1867 mutex_unlock(&ctl_mutex);
1868
1869 return ret;
1870}
1871
/*
 * Set up the gendisk and request queue for an rbd device: read the
 * image header from the OSDs, build the snapshot device list, select
 * the mapped snapshot (filling in its size), then create and announce
 * the disk.  Returns 0 on success or a negative errno.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	int rc;
	u64 segment_size;
	u64 total_size = 0;	/* bytes; filled by rbd_header_set_snap() */

	/* contact OSD, request size info about the object being mapped */
	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (rc)
		return rc;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = rbd_dev_snap_devs_update(rbd_dev);
	if (rc)
		return rc;

	rc = rbd_header_set_snap(rbd_dev, &total_size);
	if (rc)
		return rc;

	/* create gendisk info */
	rc = -ENOMEM;
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		goto out;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	rc = -ENOMEM;
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* keep bios from crossing rbd object boundaries (rbd_merge_bvec) */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	/* finally, announce the disk to the world */
	set_capacity(disk, total_size / SECTOR_SIZE);
	add_disk(disk);

	pr_info("%s: added with size 0x%llx\n",
		disk->disk_name, (unsigned long long)total_size);
	return 0;

out_disk:
	put_disk(disk);
out:
	return rc;
}
1943
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001944/*
1945 sysfs
1946*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001947
/* Map a sysfs struct device back to its containing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1952
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001953static ssize_t rbd_size_show(struct device *dev,
1954 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001955{
Alex Elder593a9e72012-02-07 12:03:37 -06001956 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001957 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001958
Josh Durgina51aa0c2011-12-05 10:35:04 -08001959 down_read(&rbd_dev->header_rwsem);
1960 size = get_capacity(rbd_dev->disk);
1961 up_read(&rbd_dev->header_rwsem);
1962
1963 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001964}
1965
/* sysfs "major": the block device major number for this mapping. */
static ssize_t rbd_major_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%d\n", rbd_dev->major);
}
1973
/* sysfs "client_id": the ceph client instance id used for this mapping. */
static ssize_t rbd_client_id_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "client%lld\n",
			ceph_client_id(rbd_dev->rbd_client->client));
}
1982
/* sysfs "pool": name of the rados pool containing the image. */
static ssize_t rbd_pool_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->pool_name);
}
1990
/* sysfs "pool_id": numeric id of the rados pool containing the image. */
static ssize_t rbd_pool_id_show(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%d\n", rbd_dev->pool_id);
}
1998
/* sysfs "name": the rbd image name. */
static ssize_t rbd_name_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->image_name);
}
2006
/* sysfs "current_snap": name of the snapshot this device is mapped to. */
static ssize_t rbd_snap_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
}
2015
2016static ssize_t rbd_image_refresh(struct device *dev,
2017 struct device_attribute *attr,
2018 const char *buf,
2019 size_t size)
2020{
Alex Elder593a9e72012-02-07 12:03:37 -06002021 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05002022 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002023
Alex Elder1fe5e992012-07-25 09:32:41 -05002024 ret = rbd_refresh_header(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05002025
2026 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002027}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002028
/* Per-image sysfs attributes and the device type that exposes them. */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	&dev_attr_create_snap.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* Empty release: the rbd_device's lifetime is managed elsewhere. */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
2070
2071
2072/*
2073 sysfs - snapshots
2074*/
2075
/* sysfs "snap_size": the snapshot's image size in bytes. */
static ssize_t rbd_snap_size_show(struct device *dev,
				  struct device_attribute *attr,
				  char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
}
2084
/* sysfs "snap_id": the snapshot's numeric id. */
static ssize_t rbd_snap_id_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
}
2093
/* Per-snapshot sysfs attributes and the device type exposing them. */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* Final put on a snapshot device frees the rbd_snap and its name. */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2123
/*
 * Unlink a snapshot from its device's list and unregister its sysfs
 * device.  Once the last reference is dropped, the device's release
 * callback (rbd_snap_dev_release) frees the rbd_snap itself, so the
 * caller must not touch "snap" afterward.
 */
static void __rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	device_unregister(&snap->dev);
}
2129
Alex Elder14e70852012-07-19 09:09:27 -05002130static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002131 struct device *parent)
2132{
2133 struct device *dev = &snap->dev;
2134 int ret;
2135
2136 dev->type = &rbd_snap_device_type;
2137 dev->parent = parent;
2138 dev->release = rbd_snap_dev_release;
2139 dev_set_name(dev, "snap_%s", snap->name);
2140 ret = device_register(dev);
2141
2142 return ret;
2143}
2144
/*
 * Allocate and initialize an rbd_snap for snapshot slot "i" of the
 * image header, registering its sysfs device if the parent rbd device
 * is already registered.  Returns the new rbd_snap, or an ERR_PTR on
 * failure (the partially constructed snap is freed).
 */
static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
					      int i, const char *name)
{
	struct rbd_snap *snap;
	int ret;

	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
	if (!snap)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	snap->name = kstrdup(name, GFP_KERNEL);
	if (!snap->name)
		goto err;

	/* Size and id come from the header's parallel snapshot arrays */
	snap->size = rbd_dev->header.snap_sizes[i];
	snap->id = rbd_dev->header.snapc->snaps[i];
	if (device_is_registered(&rbd_dev->dev)) {
		ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
		if (ret < 0)
			goto err;
	}

	return snap;

err:
	kfree(snap->name);
	kfree(snap);

	return ERR_PTR(ret);
}
2176
2177/*
Alex Elder35938152012-08-02 11:29:46 -05002178 * Scan the rbd device's current snapshot list and compare it to the
2179 * newly-received snapshot context. Remove any existing snapshots
2180 * not present in the new snapshot context. Add a new snapshot for
2181 * any snaphots in the snapshot context not in the current list.
2182 * And verify there are no changes to snapshots we already know
2183 * about.
2184 *
2185 * Assumes the snapshots in the snapshot context are sorted by
2186 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2187 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002188 */
Alex Elder9fcbb802012-08-23 23:48:49 -05002189static int rbd_dev_snap_devs_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002190{
Alex Elder35938152012-08-02 11:29:46 -05002191 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2192 const u32 snap_count = snapc->num_snaps;
2193 char *snap_name = rbd_dev->header.snap_names;
2194 struct list_head *head = &rbd_dev->snaps;
2195 struct list_head *links = head->next;
2196 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002197
Alex Elder9fcbb802012-08-23 23:48:49 -05002198 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
Alex Elder35938152012-08-02 11:29:46 -05002199 while (index < snap_count || links != head) {
2200 u64 snap_id;
2201 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002202
Alex Elder35938152012-08-02 11:29:46 -05002203 snap_id = index < snap_count ? snapc->snaps[index]
2204 : CEPH_NOSNAP;
2205 snap = links != head ? list_entry(links, struct rbd_snap, node)
2206 : NULL;
Alex Elderaafb2302012-09-06 16:00:54 -05002207 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002208
Alex Elder35938152012-08-02 11:29:46 -05002209 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2210 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002211
Alex Elder35938152012-08-02 11:29:46 -05002212 /* Existing snapshot not in the new snap context */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002213
Alex Elderf84344f2012-08-31 17:29:51 -05002214 if (rbd_dev->mapping.snap_id == snap->id)
2215 rbd_dev->mapping.snap_exists = false;
Alex Elder35938152012-08-02 11:29:46 -05002216 __rbd_remove_snap_dev(snap);
Alex Elder9fcbb802012-08-23 23:48:49 -05002217 dout("%ssnap id %llu has been removed\n",
Alex Elderf84344f2012-08-31 17:29:51 -05002218 rbd_dev->mapping.snap_id == snap->id ?
2219 "mapped " : "",
Alex Elder9fcbb802012-08-23 23:48:49 -05002220 (unsigned long long) snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002221
Alex Elder35938152012-08-02 11:29:46 -05002222 /* Done with this list entry; advance */
2223
2224 links = next;
2225 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002226 }
Alex Elder35938152012-08-02 11:29:46 -05002227
Alex Elder9fcbb802012-08-23 23:48:49 -05002228 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2229 (unsigned long long) snap_id);
Alex Elder35938152012-08-02 11:29:46 -05002230 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2231 struct rbd_snap *new_snap;
2232
2233 /* We haven't seen this snapshot before */
2234
2235 new_snap = __rbd_add_snap_dev(rbd_dev, index,
2236 snap_name);
Alex Elder9fcbb802012-08-23 23:48:49 -05002237 if (IS_ERR(new_snap)) {
2238 int err = PTR_ERR(new_snap);
2239
2240 dout(" failed to add dev, error %d\n", err);
2241
2242 return err;
2243 }
Alex Elder35938152012-08-02 11:29:46 -05002244
2245 /* New goes before existing, or at end of list */
2246
Alex Elder9fcbb802012-08-23 23:48:49 -05002247 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05002248 if (snap)
2249 list_add_tail(&new_snap->node, &snap->node);
2250 else
Alex Elder523f3252012-08-30 00:16:37 -05002251 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05002252 } else {
2253 /* Already have this one */
2254
Alex Elder9fcbb802012-08-23 23:48:49 -05002255 dout(" already present\n");
2256
Alex Elderaafb2302012-09-06 16:00:54 -05002257 rbd_assert(snap->size ==
2258 rbd_dev->header.snap_sizes[index]);
2259 rbd_assert(!strcmp(snap->name, snap_name));
Alex Elder35938152012-08-02 11:29:46 -05002260
2261 /* Done with this list entry; advance */
2262
2263 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002264 }
Alex Elder35938152012-08-02 11:29:46 -05002265
2266 /* Advance to the next entry in the snapshot context */
2267
2268 index++;
2269 snap_name += strlen(snap_name) + 1;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002270 }
Alex Elder9fcbb802012-08-23 23:48:49 -05002271 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002272
2273 return 0;
2274}
2275
/*
 * Register the rbd device, and all of its current snapshot devices,
 * in sysfs under ctl_mutex.  The device is named by its numeric id.
 * If registering a snapshot device fails, the loop stops and the
 * error is returned (the rbd device itself remains registered).
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	int ret;
	struct device *dev;
	struct rbd_snap *snap;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	dev = &rbd_dev->dev;

	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);
	if (ret < 0)
		goto out;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
		if (ret < 0)
			break;
	}
out:
	mutex_unlock(&ctl_mutex);
	return ret;
}
2303
/* Unregister the rbd device from sysfs; counterpart of rbd_bus_add_dev(). */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2308
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002309static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2310{
2311 int ret, rc;
2312
2313 do {
Alex Elder0e6f3222012-07-25 09:32:40 -05002314 ret = rbd_req_sync_watch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002315 if (ret == -ERANGE) {
Alex Elder1fe5e992012-07-25 09:32:41 -05002316 rc = rbd_refresh_header(rbd_dev, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002317 if (rc < 0)
2318 return rc;
2319 }
2320 } while (ret == -ERANGE);
2321
2322 return ret;
2323}
2324
/* Highest device id handed out so far; ids start at 1. */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002341
Alex Elder1ddbe942012-01-29 13:57:44 -06002342/*
Alex Elder499afd52012-02-02 08:13:29 -06002343 * Remove an rbd_dev from the global list, and record that its
2344 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002345 */
Alex Eldere2839302012-08-29 17:11:06 -05002346static void rbd_dev_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002347{
Alex Elderd184f6b2012-01-29 13:57:44 -06002348 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05002349 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002350 int max_id;
2351
Alex Elderaafb2302012-09-06 16:00:54 -05002352 rbd_assert(rbd_id > 0);
Alex Elder499afd52012-02-02 08:13:29 -06002353
Alex Eldere2839302012-08-29 17:11:06 -05002354 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2355 (unsigned long long) rbd_dev->dev_id);
Alex Elder499afd52012-02-02 08:13:29 -06002356 spin_lock(&rbd_dev_list_lock);
2357 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002358
2359 /*
2360 * If the id being "put" is not the current maximum, there
2361 * is nothing special we need to do.
2362 */
Alex Eldere2839302012-08-29 17:11:06 -05002363 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
Alex Elderd184f6b2012-01-29 13:57:44 -06002364 spin_unlock(&rbd_dev_list_lock);
2365 return;
2366 }
2367
2368 /*
2369 * We need to update the current maximum id. Search the
2370 * list to find out what it is. We're more likely to find
2371 * the maximum at the end, so search the list backward.
2372 */
2373 max_id = 0;
2374 list_for_each_prev(tmp, &rbd_dev_list) {
2375 struct rbd_device *rbd_dev;
2376
2377 rbd_dev = list_entry(tmp, struct rbd_device, node);
2378 if (rbd_id > max_id)
2379 max_id = rbd_id;
2380 }
Alex Elder499afd52012-02-02 08:13:29 -06002381 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002382
Alex Elder1ddbe942012-01-29 13:57:44 -06002383 /*
Alex Eldere2839302012-08-29 17:11:06 -05002384 * The max id could have been updated by rbd_dev_id_get(), in
Alex Elderd184f6b2012-01-29 13:57:44 -06002385 * which case it now accurately reflects the new maximum.
2386 * Be careful not to overwrite the maximum value in that
2387 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002388 */
Alex Eldere2839302012-08-29 17:11:06 -05002389 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2390 dout(" max dev id has been reset\n");
Alex Elderb7f23c32012-01-29 13:57:43 -06002391}
2392
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";
	const char *start = *buf;

	start += strspn(start, spaces);		/* skip to start of token */
	*buf = start;

	return strcspn(start, spaces);		/* length of the token */
}
2411
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	/* White space characters for the "C" and "POSIX" locales */
	static const char spaces[] = " \f\n\r\t\v";
	size_t len;

	/* Locate the token (open-coded next_token() logic) */
	*buf += strspn(*buf, spaces);
	len = strcspn(*buf, spaces);

	/* Copy only when the caller's buffer can hold it plus a '\0' */
	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2441
2442/*
Alex Elderea3352f2012-07-09 21:04:23 -05002443 * Finds the next token in *buf, dynamically allocates a buffer big
2444 * enough to hold a copy of it, and copies the token into the new
2445 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2446 * that a duplicate buffer is created even for a zero-length token.
2447 *
2448 * Returns a pointer to the newly-allocated duplicate, or a null
2449 * pointer if memory for the duplicate was not available. If
2450 * the lenp argument is a non-null pointer, the length of the token
2451 * (not including the '\0') is returned in *lenp.
2452 *
2453 * If successful, the *buf pointer will be updated to point beyond
2454 * the end of the found token.
2455 *
2456 * Note: uses GFP_KERNEL for allocation.
2457 */
2458static inline char *dup_token(const char **buf, size_t *lenp)
2459{
2460 char *dup;
2461 size_t len;
2462
2463 len = next_token(buf);
2464 dup = kmalloc(len + 1, GFP_KERNEL);
2465 if (!dup)
2466 return NULL;
2467
2468 memcpy(dup, *buf, len);
2469 *(dup + len) = '\0';
2470 *buf += len;
2471
2472 if (lenp)
2473 *lenp = len;
2474
2475 return dup;
2476}
2477
/*
 * Parse the argument string written to /sys/bus/rbd/add.  Fills in
 * the pool_name, image_name, image_name_len, header_name, and
 * mapping.snap_name fields of the given rbd_dev, and returns the
 * monitor address list (which points into the caller's buf, NOT a
 * copy) and the options token (copied into the caller-supplied
 * options buffer).
 *
 * Returns 0 on success; -EINVAL on malformed input, -ENOMEM on
 * allocation failure.  On failure every string field allocated here
 * is freed and reset to NULL, so the caller need not clean them up.
 *
 * Note: rbd_dev is assumed to have been initially zero-filled.
 */
static int rbd_add_parse_args(struct rbd_device *rbd_dev,
			      const char *buf,
			      const char **mon_addrs,
			      size_t *mon_addrs_size,
			      char *options,
			      size_t options_size)
{
	size_t len;
	int ret;

	/* The first four tokens are required */

	/* Token 1: monitor addresses (left in place in buf, not duplicated) */
	len = next_token(&buf);
	if (!len)
		return -EINVAL;
	*mon_addrs_size = len + 1;	/* includes room for the '\0' */
	*mon_addrs = buf;

	buf += len;

	/* Token 2: options string, copied into the caller's buffer */
	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return -EINVAL;

	/* Tokens 3 and 4: pool name and image name, duplicated */
	ret = -ENOMEM;
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Create the name of the header object */

	rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
						+ sizeof (RBD_SUFFIX),
					GFP_KERNEL);
	if (!rbd_dev->header_name)
		goto out_err;
	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);

	/*
	 * The snapshot name is optional.  If none is supplied,
	 * we use the default value.
	 */
	rbd_dev->mapping.snap_name = dup_token(&buf, &len);
	if (!rbd_dev->mapping.snap_name)
		goto out_err;
	if (!len) {
		/* Replace the empty name with the default */
		kfree(rbd_dev->mapping.snap_name);
		rbd_dev->mapping.snap_name
			= kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
		if (!rbd_dev->mapping.snap_name)
			goto out_err;

		memcpy(rbd_dev->mapping.snap_name, RBD_SNAP_HEAD_NAME,
			sizeof (RBD_SNAP_HEAD_NAME));
	}

	return 0;

out_err:
	/* Undo any partial setup; kfree(NULL) is a no-op */
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->image_name);
	rbd_dev->image_name = NULL;
	rbd_dev->image_name_len = 0;
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return ret;
}
2560
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002561static ssize_t rbd_add(struct bus_type *bus,
2562 const char *buf,
2563 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002564{
Alex Eldercb8627c2012-07-09 21:04:23 -05002565 char *options;
2566 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06002567 const char *mon_addrs = NULL;
2568 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06002569 struct ceph_osd_client *osdc;
2570 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002571
2572 if (!try_module_get(THIS_MODULE))
2573 return -ENODEV;
2574
Alex Elder27cc2592012-02-02 08:13:30 -06002575 options = kmalloc(count, GFP_KERNEL);
2576 if (!options)
2577 goto err_nomem;
Alex Eldercb8627c2012-07-09 21:04:23 -05002578 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2579 if (!rbd_dev)
2580 goto err_nomem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002581
2582 /* static rbd_device initialization */
2583 spin_lock_init(&rbd_dev->lock);
2584 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002585 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08002586 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002587
Alex Elderd184f6b2012-01-29 13:57:44 -06002588 /* generate unique id: find highest unique id, add one */
Alex Eldere2839302012-08-29 17:11:06 -05002589 rbd_dev_id_get(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002590
Alex Eldera725f65e2012-02-02 08:13:30 -06002591 /* Fill in the device name, now that we have its id. */
Alex Elder81a89792012-02-02 08:13:30 -06002592 BUILD_BUG_ON(DEV_NAME_LEN
2593 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
Alex Elderde71a292012-07-03 16:01:19 -05002594 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
Alex Eldere124a82f2012-01-29 13:57:44 -06002595
Alex Eldera725f65e2012-02-02 08:13:30 -06002596 /* parse add command */
Alex Elder7ef32142012-02-02 08:13:30 -06002597 rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
Alex Eldere28fff262012-02-02 08:13:30 -06002598 options, count);
Alex Eldera725f65e2012-02-02 08:13:30 -06002599 if (rc)
2600 goto err_put_id;
2601
Alex Elderf8c38922012-08-10 13:12:07 -07002602 rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
2603 if (rc < 0)
Alex Elderf0f8cef2012-01-29 13:57:44 -06002604 goto err_put_id;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002605
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002606 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06002607 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002608 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2609 if (rc < 0)
2610 goto err_out_client;
Alex Elder9bb2f332012-07-12 10:46:35 -05002611 rbd_dev->pool_id = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002612
2613 /* register our block device */
Alex Elder27cc2592012-02-02 08:13:30 -06002614 rc = register_blkdev(0, rbd_dev->name);
2615 if (rc < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002616 goto err_out_client;
Alex Elder27cc2592012-02-02 08:13:30 -06002617 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002618
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002619 rc = rbd_bus_add_dev(rbd_dev);
2620 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002621 goto err_out_blkdev;
2622
Alex Elder32eec682012-02-08 16:11:14 -06002623 /*
2624 * At this point cleanup in the event of an error is the job
2625 * of the sysfs code (initiated by rbd_bus_del_dev()).
2626 *
2627 * Set up and announce blkdev mapping.
2628 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002629 rc = rbd_init_disk(rbd_dev);
2630 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002631 goto err_out_bus;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002632
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002633 rc = rbd_init_watch_dev(rbd_dev);
2634 if (rc)
2635 goto err_out_bus;
2636
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002637 return count;
2638
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002639err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002640 /* this will also clean up rest of rbd_dev stuff */
2641
2642 rbd_bus_del_dev(rbd_dev);
2643 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002644 return rc;
2645
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002646err_out_blkdev:
2647 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2648err_out_client:
2649 rbd_put_client(rbd_dev);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002650err_put_id:
Alex Eldercb8627c2012-07-09 21:04:23 -05002651 if (rbd_dev->pool_name) {
Alex Elderf84344f2012-08-31 17:29:51 -05002652 kfree(rbd_dev->mapping.snap_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002653 kfree(rbd_dev->header_name);
2654 kfree(rbd_dev->image_name);
Alex Eldercb8627c2012-07-09 21:04:23 -05002655 kfree(rbd_dev->pool_name);
2656 }
Alex Eldere2839302012-08-29 17:11:06 -05002657 rbd_dev_id_put(rbd_dev);
Alex Elder27cc2592012-02-02 08:13:30 -06002658err_nomem:
Alex Elder27cc2592012-02-02 08:13:30 -06002659 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05002660 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06002661
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002662 dout("Error adding device %s\n", buf);
2663 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06002664
2665 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002666}
2667
Alex Elderde71a292012-07-03 16:01:19 -05002668static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002669{
2670 struct list_head *tmp;
2671 struct rbd_device *rbd_dev;
2672
Alex Eldere124a82f2012-01-29 13:57:44 -06002673 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002674 list_for_each(tmp, &rbd_dev_list) {
2675 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05002676 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a82f2012-01-29 13:57:44 -06002677 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002678 return rbd_dev;
Alex Eldere124a82f2012-01-29 13:57:44 -06002679 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002680 }
Alex Eldere124a82f2012-01-29 13:57:44 -06002681 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002682 return NULL;
2683}
2684
/*
 * Release callback for an rbd device's embedded struct device.
 * Tears down the header watch, drops the ceph client, removes the
 * disk and block device registration, and finally frees the
 * rbd_dev and its name strings.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop the lingering watch request, if one was set up */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->mapping.snap_name);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_dev_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
2715
/*
 * Handle a write to /sys/bus/rbd/remove: parse the target device
 * id and tear down the corresponding device.
 *
 * Returns count on success; a negative errno on bad input or when
 * no device with that id exists.
 */
static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	int ret = count;

	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	/* Drop the snapshots, then let the sysfs release path do the rest */
	__rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);

done:
	mutex_unlock(&ctl_mutex);

	return ret;
}
2750
/*
 * Sysfs store handler that creates a new snapshot with the written
 * name, refreshes the header, and then (best effort) notifies
 * other clients watching the header object.
 *
 * Returns count on success, a negative errno on failure.
 */
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int ret;
	char *name = kmalloc(count + 1, GFP_KERNEL);
	if (!name)
		return -ENOMEM;

	/*
	 * NOTE(review): snprintf() with size "count" copies at most
	 * count - 1 bytes, silently dropping the final input byte --
	 * presumably the trailing newline from sysfs; confirm the
	 * input always carries one.
	 */
	snprintf(name, count, "%s", buf);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	ret = rbd_header_add_snap(rbd_dev,
				  name, GFP_KERNEL);
	if (ret < 0)
		goto err_unlock;

	ret = __rbd_refresh_header(rbd_dev, NULL);
	if (ret < 0)
		goto err_unlock;

	/* shouldn't hold ctl_mutex when notifying.. notify might
	   trigger a watch callback that would need to get that mutex */
	mutex_unlock(&ctl_mutex);

	/* make a best effort, don't error if failed */
	rbd_req_sync_notify(rbd_dev);

	ret = count;
	kfree(name);
	return ret;

err_unlock:
	mutex_unlock(&ctl_mutex);
	kfree(name);
	return ret;
}
2791
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002792/*
2793 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002794 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002795 */
2796static int rbd_sysfs_init(void)
2797{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002798 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002799
Alex Elderfed4c142012-02-07 12:03:36 -06002800 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06002801 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002802 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002803
Alex Elderfed4c142012-02-07 12:03:36 -06002804 ret = bus_register(&rbd_bus_type);
2805 if (ret < 0)
2806 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002807
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002808 return ret;
2809}
2810
/* Tear down the sysfs state in the reverse order of rbd_sysfs_init() */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
2816
2817int __init rbd_init(void)
2818{
2819 int rc;
2820
2821 rc = rbd_sysfs_init();
2822 if (rc)
2823 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002824 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002825 return 0;
2826}
2827
/* Module exit point: remove the sysfs control files */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2832
2833module_init(rbd_init);
2834module_exit(rbd_exit);
2835
2836MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
2837MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
2838MODULE_DESCRIPTION("rados block device");
2839
2840/* following authorship retained from original osdblk.c */
2841MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
2842
2843MODULE_LICENSE("GPL");