blob: 23fa962fea36f60c857c7f4ea8938132f29e749b [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* It might be useful to have this defined elsewhere too */

#define	U64_MAX	((u64) (~0ULL))

/* Driver name, as registered with the block layer and shown in messages */
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

#define RBD_MAX_SNAP_NAME_LEN	32	/* longest snapshot name accepted */
#define RBD_MAX_OPT_LEN		1024	/* longest option string accepted */

/* Snapshot name used to request mapping the (unsnapshotted) image head */
#define RBD_SNAP_HEAD_NAME	"-"

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

/* Mappings are writable unless the read-only option is given */
#define RBD_READ_ONLY_DEFAULT	false
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070079
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These four fields never change for a given rbd image */
	char *object_prefix;	/* prefix for data object names ("<prefix>.<segment>") */
	__u8 obj_order;		/* log2 of bytes per data object */
	__u8 crypt_type;	/* on-disk encryption type */
	__u8 comp_type;		/* on-disk compression type */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;			/* image size in bytes */
	struct ceph_snap_context *snapc;	/* snapshot ids and seq */
	char *snap_names;	/* consecutive NUL-terminated snapshot names */
	u64 *snap_sizes;	/* per-snapshot image size, parallel to snapc */

	u64 obj_version;	/* version of header object last read */
};
98
/* User-settable options for a mapping, parsed from the "add" string */
struct rbd_options {
	bool read_only;		/* refuse writes through this mapping */
};
102
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client *client;	/* the shared ceph client handle */
	struct kref kref;		/* reference count */
	struct list_head node;		/* entry on rbd_client_list */
};
111
/*
 * a request completion status
 */
struct rbd_req_status {
	int done;		/* nonzero once this request has completed */
	int rc;			/* completion result code */
	u64 bytes;		/* bytes transferred */
};
120
/*
 * a collection of requests
 */
struct rbd_req_coll {
	int total;			/* number of entries in status[] */
	int num_done;			/* how many entries have completed */
	struct kref kref;		/* reference count */
	struct rbd_req_status status[0];	/* trailing variable-length array */
};
130
/*
 * a single io request
 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;			/* length of this request */
	int coll_index;			/* index of this request in its collection */
	struct rbd_req_coll *coll;	/* collection this request belongs to */
};
142
/* In-core record of one snapshot of an rbd image */
struct rbd_snap {
	struct device dev;		/* sysfs device for this snapshot */
	const char *name;		/* snapshot name */
	u64 size;			/* image size at this snapshot */
	struct list_head node;		/* entry on rbd_device->snaps */
	u64 id;				/* snapshot id */
};
150
/* State describing what this device currently maps: the head or a snapshot */
struct rbd_mapping {
	char *snap_name;	/* mapped snapshot name ("-" for the head) */
	u64 snap_id;		/* mapped snapshot id (CEPH_NOSNAP for head) */
	u64 size;		/* image size for this mapping */
	bool snap_exists;	/* set when a snapshot (not head) is mapped;
				 * presumably cleared if it is later deleted
				 * — confirm against snapshot-removal code */
	bool read_only;		/* no writes allowed through this mapping */
};
158
/*
 * a single device
 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;		/* blkdev's gendisk and rq */

	struct rbd_options rbd_opts;	/* options given at map time */
	struct rbd_client *rbd_client;	/* (possibly shared) ceph client */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;		/* queue lock */

	struct rbd_image_header header;	/* in-core image metadata */
	char *image_name;		/* rbd image name */
	size_t image_name_len;		/* length of image_name */
	char *header_name;		/* name of the image's header object */
	char *pool_name;		/* pool the image lives in */
	int pool_id;			/* id of that pool */

	struct ceph_osd_event *watch_event;	/* osd event for header watch */
	struct ceph_osd_request *watch_request;	/* outstanding watch request */

	/* protects updating the header */
	struct rw_semaphore header_rwsem;

	struct rbd_mapping mapping;	/* what this device currently maps */

	struct list_head node;		/* entry on rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
};
198
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);	/* protects rbd_dev_list */

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);	/* protects rbd_client_list */

/* Forward declarations for snapshot and sysfs helpers defined later */
static int rbd_dev_snap_devs_update(struct rbd_device *rbd_dev);
static void rbd_dev_release(struct device *dev);
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count);
static void __rbd_remove_snap_dev(struct rbd_snap *snap);

/* Handlers for the bus-level "add" and "remove" sysfs control files */
static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
219
/* The rbd bus exposes write-only "add" and "remove" control files in sysfs */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};
230
/* Release callback for rbd_root_dev; the device is static, nothing to free */
static void rbd_root_dev_release(struct device *dev)
{
}
234
/* Parent sysfs device for all rbd devices */
static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};
239
#ifdef RBD_DEBUG
/*
 * Report an assertion failure (function, line, and the failing
 * expression) and BUG().  Compiles to nothing unless RBD_DEBUG
 * is defined above.
 */
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800252
/* Take a reference on the rbd device's embedded struct device. */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}
257
/* Drop a reference taken with rbd_get_dev(). */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700262
Alex Elder1fe5e992012-07-25 09:32:41 -0500263static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700264
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700265static int rbd_open(struct block_device *bdev, fmode_t mode)
266{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600267 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700268
Alex Elderf84344f2012-08-31 17:29:51 -0500269 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700270 return -EROFS;
271
Alex Elder340c7a22012-08-10 13:12:07 -0700272 rbd_get_dev(rbd_dev);
Alex Elderf84344f2012-08-31 17:29:51 -0500273 set_device_ro(bdev, rbd_dev->mapping.read_only);
Alex Elder340c7a22012-08-10 13:12:07 -0700274
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700275 return 0;
276}
277
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800278static int rbd_release(struct gendisk *disk, fmode_t mode)
279{
280 struct rbd_device *rbd_dev = disk->private_data;
281
282 rbd_put_dev(rbd_dev);
283
284 return 0;
285}
286
/* Block device open/release hooks */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
292
/*
 * Initialize an rbd client instance.
 * We own *ceph_opts.
 *
 * On success the new client has been added to rbd_client_list and is
 * returned.  On failure ceph_opts has been destroyed (either by us or
 * by the ceph client that took ownership of it) and an ERR_PTR is
 * returned.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("rbd_client_create\n");
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
	if (IS_ERR(rbdc->client))
		goto out_mutex;
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_err;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	mutex_unlock(&ctl_mutex);

	dout("rbd_client_create created %p\n", rbdc);
	return rbdc;

out_err:
	ceph_destroy_client(rbdc->client);
out_mutex:
	mutex_unlock(&ctl_mutex);
	kfree(rbdc);
out_opt:
	/* ceph_opts is only non-NULL here if ceph_create_client() failed */
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	return ERR_PTR(ret);
}
340
341/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700342 * Find a ceph client with specific addr and configuration. If
343 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700344 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700345static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700346{
347 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700348 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700349
Alex Elder43ae4702012-07-03 16:01:18 -0500350 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700351 return NULL;
352
Alex Elder1f7ba332012-08-10 13:12:07 -0700353 spin_lock(&rbd_client_list_lock);
354 list_for_each_entry(client_node, &rbd_client_list, node) {
355 if (!ceph_compare_options(ceph_opts, client_node->client)) {
356 kref_get(&client_node->kref);
357 found = true;
358 break;
359 }
360 }
361 spin_unlock(&rbd_client_list_lock);
362
363 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700364}
365
/*
 * mount options
 *
 * Token values below Opt_last_int take an integer argument, those
 * between Opt_last_int and Opt_last_string take a string argument,
 * and those between Opt_last_string and Opt_last_bool are Boolean
 * flags.  parse_rbd_opts_token() relies on this ordering.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};
379
Alex Elder43ae4702012-07-03 16:01:18 -0500380static match_table_t rbd_opts_tokens = {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700381 /* int args above */
382 /* string args above */
Alex Elderf84344f2012-08-31 17:29:51 -0500383 {Opt_read_only, "mapping.read_only"},
Alex Eldercc0538b2012-08-10 13:12:07 -0700384 {Opt_read_only, "ro"}, /* Alternate spelling */
385 {Opt_read_write, "read_write"},
386 {Opt_read_write, "rw"}, /* Alternate spelling */
387 /* Boolean args above */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700388 {-1, NULL}
389};
390
391static int parse_rbd_opts_token(char *c, void *private)
392{
Alex Elder43ae4702012-07-03 16:01:18 -0500393 struct rbd_options *rbd_opts = private;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700394 substring_t argstr[MAX_OPT_ARGS];
395 int token, intval, ret;
396
Alex Elder43ae4702012-07-03 16:01:18 -0500397 token = match_token(c, rbd_opts_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700398 if (token < 0)
399 return -EINVAL;
400
401 if (token < Opt_last_int) {
402 ret = match_int(&argstr[0], &intval);
403 if (ret < 0) {
404 pr_err("bad mount option arg (not int) "
405 "at '%s'\n", c);
406 return ret;
407 }
408 dout("got int token %d val %d\n", token, intval);
409 } else if (token > Opt_last_int && token < Opt_last_string) {
410 dout("got string token %d val %s\n", token,
411 argstr[0].from);
Alex Eldercc0538b2012-08-10 13:12:07 -0700412 } else if (token > Opt_last_string && token < Opt_last_bool) {
413 dout("got Boolean token %d\n", token);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700414 } else {
415 dout("got token %d\n", token);
416 }
417
418 switch (token) {
Alex Eldercc0538b2012-08-10 13:12:07 -0700419 case Opt_read_only:
420 rbd_opts->read_only = true;
421 break;
422 case Opt_read_write:
423 rbd_opts->read_only = false;
424 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700425 default:
Alex Elderaafb230e2012-09-06 16:00:54 -0500426 rbd_assert(false);
427 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700428 }
429 return 0;
430}
431
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.
 *
 * On success rbd_dev->rbd_client holds a referenced client and 0 is
 * returned; on failure a negative errno is returned.  The rbd-specific
 * options in the option string are parsed into rbd_dev->rbd_opts as a
 * side effect of ceph_parse_options().
 */
static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
			  size_t mon_addr_len, char *options)
{
	struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
	struct ceph_options *ceph_opts;
	struct rbd_client *rbdc;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	ceph_opts = ceph_parse_options(options, mon_addr,
					mon_addr + mon_addr_len,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(ceph_opts))
		return PTR_ERR(ceph_opts);

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		/* using an existing client; we own (and drop) ceph_opts */
		ceph_destroy_options(ceph_opts);
	} else {
		/* rbd_client_create() takes ownership of ceph_opts */
		rbdc = rbd_client_create(ceph_opts);
		if (IS_ERR(rbdc))
			return PTR_ERR(rbdc);
	}
	rbd_dev->rbd_client = rbdc;

	return 0;
}
464
/*
 * Destroy ceph client
 *
 * kref release callback.  Unlinks the client from rbd_client_list
 * (taking rbd_client_list_lock itself) and destroys the underlying
 * ceph client.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
482
/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.  rbd_dev->rbd_client is cleared and must not be used afterward.
 */
static void rbd_put_client(struct rbd_device *rbd_dev)
{
	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
	rbd_dev->rbd_client = NULL;
}
492
/*
 * Destroy requests collection
 *
 * kref release callback: frees the collection once its last
 * reference has been dropped.
 */
static void rbd_coll_release(struct kref *kref)
{
	struct rbd_req_coll *coll =
		container_of(kref, struct rbd_req_coll, kref);

	dout("rbd_coll_release %p\n", coll);
	kfree(coll);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700504
Alex Elder8e94af82012-07-25 09:32:40 -0500505static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
506{
Alex Elder103a1502012-08-02 11:29:45 -0500507 size_t size;
508 u32 snap_count;
509
510 /* The header has to start with the magic rbd header text */
511 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
512 return false;
513
514 /*
515 * The size of a snapshot header has to fit in a size_t, and
516 * that limits the number of snapshots.
517 */
518 snap_count = le32_to_cpu(ondisk->snap_count);
519 size = SIZE_MAX - sizeof (struct ceph_snap_context);
520 if (snap_count > size / sizeof (__le64))
521 return false;
522
523 /*
524 * Not only that, but the size of the entire the snapshot
525 * header must also be representable in a size_t.
526 */
527 size -= snap_count * sizeof (__le64);
528 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
529 return false;
530
531 return true;
Alex Elder8e94af82012-07-25 09:32:40 -0500532}
533
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700534/*
535 * Create a new header structure, translate header format from the on-disk
536 * header.
537 */
538static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500539 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700540{
Alex Elderccece232012-07-10 20:30:10 -0500541 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500542 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500543 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500544 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700545
Alex Elder6a523252012-07-19 17:12:59 -0500546 memset(header, 0, sizeof (*header));
547
Alex Elder103a1502012-08-02 11:29:45 -0500548 snap_count = le32_to_cpu(ondisk->snap_count);
549
Alex Elder58c17b02012-08-23 23:22:06 -0500550 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
551 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500552 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700553 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500554 memcpy(header->object_prefix, ondisk->object_prefix, len);
555 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600556
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700557 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500558 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
559
Alex Elder621901d2012-08-23 23:22:06 -0500560 /* Save a copy of the snapshot names */
561
Alex Elderf785cc12012-08-23 23:22:06 -0500562 if (snap_names_len > (u64) SIZE_MAX)
563 return -EIO;
564 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700565 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500566 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500567 /*
568 * Note that rbd_dev_v1_header_read() guarantees
569 * the ondisk buffer we're working with has
570 * snap_names_len bytes beyond the end of the
571 * snapshot id array, this memcpy() is safe.
572 */
573 memcpy(header->snap_names, &ondisk->snaps[snap_count],
574 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500575
Alex Elder621901d2012-08-23 23:22:06 -0500576 /* Record each snapshot's size */
577
Alex Elderd2bb24e2012-07-26 23:37:14 -0500578 size = snap_count * sizeof (*header->snap_sizes);
579 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700580 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500581 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500582 for (i = 0; i < snap_count; i++)
583 header->snap_sizes[i] =
584 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700585 } else {
Alex Elderccece232012-07-10 20:30:10 -0500586 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700587 header->snap_names = NULL;
588 header->snap_sizes = NULL;
589 }
Alex Elder849b4262012-07-09 21:04:24 -0500590
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700591 header->obj_order = ondisk->options.order;
592 header->crypt_type = ondisk->options.crypt_type;
593 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500594
Alex Elder621901d2012-08-23 23:22:06 -0500595 /* Allocate and fill in the snapshot context */
596
Alex Elderf84344f2012-08-31 17:29:51 -0500597 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500598 size = sizeof (struct ceph_snap_context);
599 size += snap_count * sizeof (header->snapc->snaps[0]);
600 header->snapc = kzalloc(size, GFP_KERNEL);
601 if (!header->snapc)
602 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700603
604 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500605 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700606 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500607 for (i = 0; i < snap_count; i++)
608 header->snapc->snaps[i] =
609 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700610
611 return 0;
612
Alex Elder6a523252012-07-19 17:12:59 -0500613out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500614 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500615 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700616 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500617 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500618 kfree(header->object_prefix);
619 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500620
Alex Elder00f1f362012-02-07 12:03:36 -0600621 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700622}
623
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700624static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
625 u64 *seq, u64 *size)
626{
627 int i;
628 char *p = header->snap_names;
629
Alex Elderc9aadfe2012-08-30 14:42:15 -0500630 rbd_assert(header->snapc != NULL);
631 for (i = 0; i < header->snapc->num_snaps; i++) {
Alex Elder00f1f362012-02-07 12:03:36 -0600632 if (!strcmp(snap_name, p)) {
633
634 /* Found it. Pass back its id and/or size */
635
636 if (seq)
637 *seq = header->snapc->snaps[i];
638 if (size)
639 *size = header->snap_sizes[i];
640 return i;
641 }
642 p += strlen(p) + 1; /* Skip ahead to the next name */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700643 }
Alex Elder00f1f362012-02-07 12:03:36 -0600644 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700645}
646
/*
 * Record which snapshot (or the unsnapshotted head) of the image
 * this device maps, filling in rbd_dev->mapping.  A head mapping is
 * writable subject to the read_only option; a snapshot mapping is
 * always read-only.  Returns 0 on success or -ENOENT if snap_name
 * names no existing snapshot.  The mapping keeps a reference to the
 * snap_name string passed in.
 */
static int rbd_header_set_snap(struct rbd_device *rbd_dev, char *snap_name)
{
	int ret;

	down_write(&rbd_dev->header_rwsem);

	if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		/* Mapping the head: full image size, writes may be allowed */
		rbd_dev->mapping.snap_id = CEPH_NOSNAP;
		rbd_dev->mapping.size = rbd_dev->header.image_size;
		rbd_dev->mapping.snap_exists = false;
		rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only;
	} else {
		ret = snap_by_name(&rbd_dev->header, snap_name,
					&rbd_dev->mapping.snap_id,
					&rbd_dev->mapping.size);
		if (ret < 0)
			goto done;
		rbd_dev->mapping.snap_exists = true;
		rbd_dev->mapping.read_only = true;
	}
	rbd_dev->mapping.snap_name = snap_name;

	ret = 0;	/* snap_by_name() returned an index; callers want 0 */
done:
	up_write(&rbd_dev->header_rwsem);
	return ret;
}
675
676static void rbd_header_free(struct rbd_image_header *header)
677{
Alex Elder849b4262012-07-09 21:04:24 -0500678 kfree(header->object_prefix);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500679 header->object_prefix = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700680 kfree(header->snap_sizes);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500681 header->snap_sizes = NULL;
Alex Elder849b4262012-07-09 21:04:24 -0500682 kfree(header->snap_names);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500683 header->snap_names = NULL;
Josh Durgind1d25642011-12-05 14:03:05 -0800684 ceph_put_snap_context(header->snapc);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500685 header->snapc = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700686}
687
Alex Elder65ccfe22012-08-09 10:33:26 -0700688static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700689{
Alex Elder65ccfe22012-08-09 10:33:26 -0700690 char *name;
691 u64 segment;
692 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700693
Alex Elder65ccfe22012-08-09 10:33:26 -0700694 name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
695 if (!name)
696 return NULL;
697 segment = offset >> rbd_dev->header.obj_order;
698 ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
699 rbd_dev->header.object_prefix, segment);
700 if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
701 pr_err("error formatting segment name for #%llu (%d)\n",
702 segment, ret);
703 kfree(name);
704 name = NULL;
705 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700706
Alex Elder65ccfe22012-08-09 10:33:26 -0700707 return name;
708}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700709
Alex Elder65ccfe22012-08-09 10:33:26 -0700710static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
711{
712 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700713
Alex Elder65ccfe22012-08-09 10:33:26 -0700714 return offset & (segment_size - 1);
715}
716
717static u64 rbd_segment_length(struct rbd_device *rbd_dev,
718 u64 offset, u64 length)
719{
720 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
721
722 offset &= segment_size - 1;
723
Alex Elderaafb230e2012-09-06 16:00:54 -0500724 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700725 if (offset + length > segment_size)
726 length = segment_size - offset;
727
728 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700729}
730
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700731static int rbd_get_num_segments(struct rbd_image_header *header,
732 u64 ofs, u64 len)
733{
Alex Elderdf111be2012-08-09 10:33:26 -0700734 u64 start_seg;
735 u64 end_seg;
736
737 if (!len)
738 return 0;
739 if (len - 1 > U64_MAX - ofs)
740 return -ERANGE;
741
742 start_seg = ofs >> header->obj_order;
743 end_seg = (ofs + len - 1) >> header->obj_order;
744
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700745 return end_seg - start_seg + 1;
746}
747
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700748/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700749 * returns the size of an object in the image
750 */
751static u64 rbd_obj_bytes(struct rbd_image_header *header)
752{
753 return 1 << header->obj_order;
754}
755
756/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700757 * bio helpers
758 */
759
760static void bio_chain_put(struct bio *chain)
761{
762 struct bio *tmp;
763
764 while (chain) {
765 tmp = chain;
766 chain = chain->bi_next;
767 bio_put(tmp);
768 }
769}
770
/*
 * zeros a bio chain, starting at specific offset
 *
 * All data at or beyond start_ofs (a byte offset measured from the
 * start of the chain) is zeroed in place; data before it is left
 * untouched.  Each page is mapped temporarily with bvec_kmap_irq()
 * before being written.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* running byte offset within the whole chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* zero from start_ofs (or the segment
				 * start, if we're already past it) */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
797
798/*
799 * bio_chain_clone - clone a chain of bios up to a certain length.
800 * might return a bio_pair that will need to be released.
801 */
802static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
803 struct bio_pair **bp,
804 int len, gfp_t gfpmask)
805{
Alex Elder542582f2012-08-09 10:33:25 -0700806 struct bio *old_chain = *old;
807 struct bio *new_chain = NULL;
808 struct bio *tail;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700809 int total = 0;
810
811 if (*bp) {
812 bio_pair_release(*bp);
813 *bp = NULL;
814 }
815
816 while (old_chain && (total < len)) {
Alex Elder542582f2012-08-09 10:33:25 -0700817 struct bio *tmp;
818
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700819 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
820 if (!tmp)
821 goto err_out;
Alex Elder542582f2012-08-09 10:33:25 -0700822 gfpmask &= ~__GFP_WAIT; /* can't wait after the first */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700823
824 if (total + old_chain->bi_size > len) {
825 struct bio_pair *bp;
826
827 /*
828 * this split can only happen with a single paged bio,
829 * split_bio will BUG_ON if this is not the case
830 */
831 dout("bio_chain_clone split! total=%d remaining=%d"
Alex Elderbd919d42012-07-13 20:35:11 -0500832 "bi_size=%u\n",
833 total, len - total, old_chain->bi_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700834
835 /* split the bio. We'll release it either in the next
836 call, or it will have to be released outside */
Alex Elder593a9e72012-02-07 12:03:37 -0600837 bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700838 if (!bp)
839 goto err_out;
840
841 __bio_clone(tmp, &bp->bio1);
842
843 *next = &bp->bio2;
844 } else {
845 __bio_clone(tmp, old_chain);
846 *next = old_chain->bi_next;
847 }
848
849 tmp->bi_bdev = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700850 tmp->bi_next = NULL;
Alex Elder542582f2012-08-09 10:33:25 -0700851 if (new_chain)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700852 tail->bi_next = tmp;
Alex Elder542582f2012-08-09 10:33:25 -0700853 else
854 new_chain = tmp;
855 tail = tmp;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700856 old_chain = old_chain->bi_next;
857
858 total += tmp->bi_size;
859 }
860
Alex Elderaafb230e2012-09-06 16:00:54 -0500861 rbd_assert(total == len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700862
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700863 *old = old_chain;
864
865 return new_chain;
866
867err_out:
868 dout("bio_chain_clone with err\n");
869 bio_chain_put(new_chain);
870 return NULL;
871}
872
873/*
874 * helpers for osd request op vectors.
875 */
Alex Elder57cfc102012-06-26 12:57:03 -0700876static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
877 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700878{
Alex Elder57cfc102012-06-26 12:57:03 -0700879 struct ceph_osd_req_op *ops;
880
881 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
882 if (!ops)
883 return NULL;
884
885 ops[0].op = opcode;
886
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700887 /*
888 * op extent offset and length will be set later on
889 * in calc_raw_layout()
890 */
Alex Elder57cfc102012-06-26 12:57:03 -0700891 ops[0].payload_len = payload_len;
892
893 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700894}
895
/*
 * Free an op vector allocated by rbd_create_rw_ops().
 */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
900
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700901static void rbd_coll_end_req_index(struct request *rq,
902 struct rbd_req_coll *coll,
903 int index,
904 int ret, u64 len)
905{
906 struct request_queue *q;
907 int min, max, i;
908
Alex Elderbd919d42012-07-13 20:35:11 -0500909 dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
910 coll, index, ret, (unsigned long long) len);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700911
912 if (!rq)
913 return;
914
915 if (!coll) {
916 blk_end_request(rq, ret, len);
917 return;
918 }
919
920 q = rq->q;
921
922 spin_lock_irq(q->queue_lock);
923 coll->status[index].done = 1;
924 coll->status[index].rc = ret;
925 coll->status[index].bytes = len;
926 max = min = coll->num_done;
927 while (max < coll->total && coll->status[max].done)
928 max++;
929
930 for (i = min; i<max; i++) {
931 __blk_end_request(rq, coll->status[i].rc,
932 coll->status[i].bytes);
933 coll->num_done++;
934 kref_put(&coll->kref, rbd_coll_release);
935 }
936 spin_unlock_irq(q->queue_lock);
937}
938
939static void rbd_coll_end_req(struct rbd_request *req,
940 int ret, u64 len)
941{
942 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
943}
944
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700945/*
946 * Send ceph osd request
947 */
948static int rbd_do_request(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -0500949 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700950 struct ceph_snap_context *snapc,
951 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -0500952 const char *object_name, u64 ofs, u64 len,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700953 struct bio *bio,
954 struct page **pages,
955 int num_pages,
956 int flags,
957 struct ceph_osd_req_op *ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700958 struct rbd_req_coll *coll,
959 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700960 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700961 struct ceph_msg *msg),
962 struct ceph_osd_request **linger_req,
963 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700964{
965 struct ceph_osd_request *req;
966 struct ceph_file_layout *layout;
967 int ret;
968 u64 bno;
969 struct timespec mtime = CURRENT_TIME;
970 struct rbd_request *req_data;
971 struct ceph_osd_request_head *reqhead;
Alex Elder1dbb4392012-01-24 10:08:37 -0600972 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700973
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700974 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700975 if (!req_data) {
976 if (coll)
977 rbd_coll_end_req_index(rq, coll, coll_index,
978 -ENOMEM, len);
979 return -ENOMEM;
980 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700981
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700982 if (coll) {
983 req_data->coll = coll;
984 req_data->coll_index = coll_index;
985 }
986
Alex Elderbd919d42012-07-13 20:35:11 -0500987 dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
988 (unsigned long long) ofs, (unsigned long long) len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700989
Alex Elder0ce1a792012-07-03 16:01:18 -0500990 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder1dbb4392012-01-24 10:08:37 -0600991 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
992 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -0700993 if (!req) {
Sage Weil4ad12622011-05-03 09:23:36 -0700994 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700995 goto done_pages;
996 }
997
998 req->r_callback = rbd_cb;
999
1000 req_data->rq = rq;
1001 req_data->bio = bio;
1002 req_data->pages = pages;
1003 req_data->len = len;
1004
1005 req->r_priv = req_data;
1006
1007 reqhead = req->r_request->front.iov_base;
1008 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
1009
Alex Elderaded07e2012-07-03 16:01:18 -05001010 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001011 req->r_oid_len = strlen(req->r_oid);
1012
1013 layout = &req->r_file_layout;
1014 memset(layout, 0, sizeof(*layout));
1015 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1016 layout->fl_stripe_count = cpu_to_le32(1);
1017 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
Alex Elder0ce1a792012-07-03 16:01:18 -05001018 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
Alex Elder1dbb4392012-01-24 10:08:37 -06001019 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
1020 req, ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001021
1022 ceph_osdc_build_request(req, ofs, &len,
1023 ops,
1024 snapc,
1025 &mtime,
1026 req->r_oid, req->r_oid_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001027
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001028 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -06001029 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001030 *linger_req = req;
1031 }
1032
Alex Elder1dbb4392012-01-24 10:08:37 -06001033 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001034 if (ret < 0)
1035 goto done_err;
1036
1037 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -06001038 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001039 if (ver)
1040 *ver = le64_to_cpu(req->r_reassert_version.version);
Alex Elderbd919d42012-07-13 20:35:11 -05001041 dout("reassert_ver=%llu\n",
1042 (unsigned long long)
1043 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001044 ceph_osdc_put_request(req);
1045 }
1046 return ret;
1047
1048done_err:
1049 bio_chain_put(req_data->bio);
1050 ceph_osdc_put_request(req);
1051done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001052 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001053 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001054 return ret;
1055}
1056
1057/*
1058 * Ceph osd op callback
1059 */
1060static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1061{
1062 struct rbd_request *req_data = req->r_priv;
1063 struct ceph_osd_reply_head *replyhead;
1064 struct ceph_osd_op *op;
1065 __s32 rc;
1066 u64 bytes;
1067 int read_op;
1068
1069 /* parse reply */
1070 replyhead = msg->front.iov_base;
1071 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1072 op = (void *)(replyhead + 1);
1073 rc = le32_to_cpu(replyhead->result);
1074 bytes = le64_to_cpu(op->extent.length);
Dan Carpenter895cfcc2012-06-06 09:15:33 -05001075 read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001076
Alex Elderbd919d42012-07-13 20:35:11 -05001077 dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1078 (unsigned long long) bytes, read_op, (int) rc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001079
1080 if (rc == -ENOENT && read_op) {
1081 zero_bio_chain(req_data->bio, 0);
1082 rc = 0;
1083 } else if (rc == 0 && read_op && bytes < req_data->len) {
1084 zero_bio_chain(req_data->bio, bytes);
1085 bytes = req_data->len;
1086 }
1087
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001088 rbd_coll_end_req(req_data, rc, bytes);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001089
1090 if (req_data->bio)
1091 bio_chain_put(req_data->bio);
1092
1093 ceph_osdc_put_request(req);
1094 kfree(req_data);
1095}
1096
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001097static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1098{
1099 ceph_osdc_put_request(req);
1100}
1101
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001102/*
1103 * Do a synchronous ceph osd operation
1104 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001105static int rbd_req_sync_op(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001106 struct ceph_snap_context *snapc,
1107 u64 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001108 int flags,
Alex Elder913d2fd2012-06-26 12:57:03 -07001109 struct ceph_osd_req_op *ops,
Alex Elderaded07e2012-07-03 16:01:18 -05001110 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001111 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001112 char *buf,
1113 struct ceph_osd_request **linger_req,
1114 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001115{
1116 int ret;
1117 struct page **pages;
1118 int num_pages;
Alex Elder913d2fd2012-06-26 12:57:03 -07001119
Alex Elderaafb230e2012-09-06 16:00:54 -05001120 rbd_assert(ops != NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001121
1122 num_pages = calc_pages_for(ofs , len);
1123 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
Dan Carpenterb8d06382010-10-11 21:14:23 +02001124 if (IS_ERR(pages))
1125 return PTR_ERR(pages);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001126
Alex Elder0ce1a792012-07-03 16:01:18 -05001127 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001128 object_name, ofs, len, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001129 pages, num_pages,
1130 flags,
1131 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001132 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001133 NULL,
1134 linger_req, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001135 if (ret < 0)
Alex Elder913d2fd2012-06-26 12:57:03 -07001136 goto done;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001137
1138 if ((flags & CEPH_OSD_FLAG_READ) && buf)
1139 ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
1140
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001141done:
1142 ceph_release_page_vector(pages, num_pages);
1143 return ret;
1144}
1145
1146/*
1147 * Do an asynchronous ceph osd operation
1148 */
1149static int rbd_do_op(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -05001150 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001151 struct ceph_snap_context *snapc,
1152 u64 snapid,
Alex Elderd1f57ea2012-06-26 12:57:03 -07001153 int opcode, int flags,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001154 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001155 struct bio *bio,
1156 struct rbd_req_coll *coll,
1157 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001158{
1159 char *seg_name;
1160 u64 seg_ofs;
1161 u64 seg_len;
1162 int ret;
1163 struct ceph_osd_req_op *ops;
1164 u32 payload_len;
1165
Alex Elder65ccfe22012-08-09 10:33:26 -07001166 seg_name = rbd_segment_name(rbd_dev, ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001167 if (!seg_name)
1168 return -ENOMEM;
Alex Elder65ccfe22012-08-09 10:33:26 -07001169 seg_len = rbd_segment_length(rbd_dev, ofs, len);
1170 seg_ofs = rbd_segment_offset(rbd_dev, ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001171
1172 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1173
Alex Elder57cfc102012-06-26 12:57:03 -07001174 ret = -ENOMEM;
1175 ops = rbd_create_rw_ops(1, opcode, payload_len);
1176 if (!ops)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001177 goto done;
1178
1179 /* we've taken care of segment sizes earlier when we
1180 cloned the bios. We should never have a segment
1181 truncated at this point */
Alex Elderaafb230e2012-09-06 16:00:54 -05001182 rbd_assert(seg_len == len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001183
1184 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1185 seg_name, seg_ofs, seg_len,
1186 bio,
1187 NULL, 0,
1188 flags,
1189 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001190 coll, coll_index,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001191 rbd_req_cb, 0, NULL);
Sage Weil11f77002011-05-12 16:13:54 -07001192
1193 rbd_destroy_ops(ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001194done:
1195 kfree(seg_name);
1196 return ret;
1197}
1198
1199/*
1200 * Request async osd write
1201 */
1202static int rbd_req_write(struct request *rq,
1203 struct rbd_device *rbd_dev,
1204 struct ceph_snap_context *snapc,
1205 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001206 struct bio *bio,
1207 struct rbd_req_coll *coll,
1208 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001209{
1210 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1211 CEPH_OSD_OP_WRITE,
1212 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001213 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001214}
1215
1216/*
1217 * Request async osd read
1218 */
1219static int rbd_req_read(struct request *rq,
1220 struct rbd_device *rbd_dev,
1221 u64 snapid,
1222 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001223 struct bio *bio,
1224 struct rbd_req_coll *coll,
1225 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001226{
1227 return rbd_do_op(rq, rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001228 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001229 CEPH_OSD_OP_READ,
1230 CEPH_OSD_FLAG_READ,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001231 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001232}
1233
1234/*
1235 * Request sync osd read
1236 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001237static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001238 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001239 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001240 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001241 char *buf,
1242 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001243{
Alex Elder913d2fd2012-06-26 12:57:03 -07001244 struct ceph_osd_req_op *ops;
1245 int ret;
1246
1247 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1248 if (!ops)
1249 return -ENOMEM;
1250
1251 ret = rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001252 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001253 CEPH_OSD_FLAG_READ,
Alex Elder913d2fd2012-06-26 12:57:03 -07001254 ops, object_name, ofs, len, buf, NULL, ver);
1255 rbd_destroy_ops(ops);
1256
1257 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001258}
1259
1260/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001261 * Request sync osd watch
1262 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001263static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001264 u64 ver,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001265 u64 notify_id)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001266{
1267 struct ceph_osd_req_op *ops;
Sage Weil11f77002011-05-12 16:13:54 -07001268 int ret;
1269
Alex Elder57cfc102012-06-26 12:57:03 -07001270 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1271 if (!ops)
1272 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001273
Josh Durgina71b8912011-12-05 18:10:44 -08001274 ops[0].watch.ver = cpu_to_le64(ver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001275 ops[0].watch.cookie = notify_id;
1276 ops[0].watch.flag = 0;
1277
Alex Elder0ce1a792012-07-03 16:01:18 -05001278 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001279 rbd_dev->header_name, 0, 0, NULL,
Alex Elderad4f2322012-07-03 16:01:19 -05001280 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001281 CEPH_OSD_FLAG_READ,
1282 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001283 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001284 rbd_simple_req_cb, 0, NULL);
1285
1286 rbd_destroy_ops(ops);
1287 return ret;
1288}
1289
/*
 * Watch event callback for the header object (registered in
 * rbd_req_sync_watch() with @data == the rbd_device).  Refreshes the
 * in-core header and acknowledges the notification.
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	u64 hver;
	int rc;

	if (!rbd_dev)
		return;

	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
		rbd_dev->header_name, (unsigned long long) notify_id,
		(unsigned int) opcode);
	rc = rbd_refresh_header(rbd_dev, &hver);
	if (rc)
		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
			   " update snaps: %d\n", rbd_dev->major, rc);

	/* NOTE(review): if rbd_refresh_header() failed without writing
	   hver, an uninitialized value is acked here — verify that the
	   helper always sets *hver */
	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
}
1309
1310/*
1311 * Request sync osd watch
1312 */
Alex Elder0e6f3222012-07-25 09:32:40 -05001313static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001314{
1315 struct ceph_osd_req_op *ops;
Alex Elder0ce1a792012-07-03 16:01:18 -05001316 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder57cfc102012-06-26 12:57:03 -07001317 int ret;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001318
Alex Elder57cfc102012-06-26 12:57:03 -07001319 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1320 if (!ops)
1321 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001322
1323 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
Alex Elder0ce1a792012-07-03 16:01:18 -05001324 (void *)rbd_dev, &rbd_dev->watch_event);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001325 if (ret < 0)
1326 goto fail;
1327
Alex Elder0e6f3222012-07-25 09:32:40 -05001328 ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
Alex Elder0ce1a792012-07-03 16:01:18 -05001329 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001330 ops[0].watch.flag = 1;
1331
Alex Elder0ce1a792012-07-03 16:01:18 -05001332 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001333 CEPH_NOSNAP,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001334 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1335 ops,
Alex Elder0e6f3222012-07-25 09:32:40 -05001336 rbd_dev->header_name,
1337 0, 0, NULL,
Alex Elder0ce1a792012-07-03 16:01:18 -05001338 &rbd_dev->watch_request, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001339
1340 if (ret < 0)
1341 goto fail_event;
1342
1343 rbd_destroy_ops(ops);
1344 return 0;
1345
1346fail_event:
Alex Elder0ce1a792012-07-03 16:01:18 -05001347 ceph_osdc_cancel_event(rbd_dev->watch_event);
1348 rbd_dev->watch_event = NULL;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001349fail:
1350 rbd_destroy_ops(ops);
1351 return ret;
1352}
1353
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001354/*
1355 * Request sync osd unwatch
1356 */
Alex Elder070c6332012-07-25 09:32:41 -05001357static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001358{
1359 struct ceph_osd_req_op *ops;
Alex Elder57cfc102012-06-26 12:57:03 -07001360 int ret;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001361
Alex Elder57cfc102012-06-26 12:57:03 -07001362 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1363 if (!ops)
1364 return -ENOMEM;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001365
1366 ops[0].watch.ver = 0;
Alex Elder0ce1a792012-07-03 16:01:18 -05001367 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001368 ops[0].watch.flag = 0;
1369
Alex Elder0ce1a792012-07-03 16:01:18 -05001370 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001371 CEPH_NOSNAP,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001372 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1373 ops,
Alex Elder070c6332012-07-25 09:32:41 -05001374 rbd_dev->header_name,
1375 0, 0, NULL, NULL, NULL);
1376
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001377
1378 rbd_destroy_ops(ops);
Alex Elder0ce1a792012-07-03 16:01:18 -05001379 ceph_osdc_cancel_event(rbd_dev->watch_event);
1380 rbd_dev->watch_event = NULL;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001381 return ret;
1382}
1383
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001384struct rbd_notify_info {
Alex Elder0ce1a792012-07-03 16:01:18 -05001385 struct rbd_device *rbd_dev;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001386};
1387
1388static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1389{
Alex Elder0ce1a792012-07-03 16:01:18 -05001390 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1391 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001392 return;
1393
Alex Elderbd919d42012-07-13 20:35:11 -05001394 dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n",
1395 rbd_dev->header_name, (unsigned long long) notify_id,
1396 (unsigned int) opcode);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001397}
1398
1399/*
1400 * Request sync osd notify
1401 */
Alex Elder4cb16252012-07-25 09:32:40 -05001402static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001403{
1404 struct ceph_osd_req_op *ops;
Alex Elder0ce1a792012-07-03 16:01:18 -05001405 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001406 struct ceph_osd_event *event;
1407 struct rbd_notify_info info;
1408 int payload_len = sizeof(u32) + sizeof(u32);
1409 int ret;
1410
Alex Elder57cfc102012-06-26 12:57:03 -07001411 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len);
1412 if (!ops)
1413 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001414
Alex Elder0ce1a792012-07-03 16:01:18 -05001415 info.rbd_dev = rbd_dev;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001416
1417 ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
1418 (void *)&info, &event);
1419 if (ret < 0)
1420 goto fail;
1421
1422 ops[0].watch.ver = 1;
1423 ops[0].watch.flag = 1;
1424 ops[0].watch.cookie = event->cookie;
1425 ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
1426 ops[0].watch.timeout = 12;
1427
Alex Elder0ce1a792012-07-03 16:01:18 -05001428 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001429 CEPH_NOSNAP,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001430 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1431 ops,
Alex Elder4cb16252012-07-25 09:32:40 -05001432 rbd_dev->header_name,
1433 0, 0, NULL, NULL, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001434 if (ret < 0)
1435 goto fail_event;
1436
1437 ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
1438 dout("ceph_osdc_wait_event returned %d\n", ret);
1439 rbd_destroy_ops(ops);
1440 return 0;
1441
1442fail_event:
1443 ceph_osdc_cancel_event(event);
1444fail:
1445 rbd_destroy_ops(ops);
1446 return ret;
1447}
1448
1449/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001450 * Request sync osd read
1451 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001452static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001453 const char *object_name,
1454 const char *class_name,
1455 const char *method_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001456 const char *data,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001457 int len,
1458 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001459{
1460 struct ceph_osd_req_op *ops;
Alex Elderaded07e2012-07-03 16:01:18 -05001461 int class_name_len = strlen(class_name);
1462 int method_name_len = strlen(method_name);
Alex Elder57cfc102012-06-26 12:57:03 -07001463 int ret;
1464
1465 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL,
Alex Elderaded07e2012-07-03 16:01:18 -05001466 class_name_len + method_name_len + len);
Alex Elder57cfc102012-06-26 12:57:03 -07001467 if (!ops)
1468 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001469
Alex Elderaded07e2012-07-03 16:01:18 -05001470 ops[0].cls.class_name = class_name;
1471 ops[0].cls.class_len = (__u8) class_name_len;
1472 ops[0].cls.method_name = method_name;
1473 ops[0].cls.method_len = (__u8) method_name_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001474 ops[0].cls.argc = 0;
1475 ops[0].cls.indata = data;
1476 ops[0].cls.indata_len = len;
1477
Alex Elder0ce1a792012-07-03 16:01:18 -05001478 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001479 CEPH_NOSNAP,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001480 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1481 ops,
Alex Elderd1f57ea2012-06-26 12:57:03 -07001482 object_name, 0, 0, NULL, NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001483
1484 rbd_destroy_ops(ops);
1485
1486 dout("cls_exec returned %d\n", ret);
1487 return ret;
1488}
1489
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001490static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1491{
1492 struct rbd_req_coll *coll =
1493 kzalloc(sizeof(struct rbd_req_coll) +
1494 sizeof(struct rbd_req_status) * num_reqs,
1495 GFP_ATOMIC);
1496
1497 if (!coll)
1498 return NULL;
1499 coll->total = num_reqs;
1500 kref_init(&coll->kref);
1501 return coll;
1502}
1503
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001504/*
1505 * block device queue callback
1506 */
1507static void rbd_rq_fn(struct request_queue *q)
1508{
1509 struct rbd_device *rbd_dev = q->queuedata;
1510 struct request *rq;
1511 struct bio_pair *bp = NULL;
1512
Alex Elder00f1f362012-02-07 12:03:36 -06001513 while ((rq = blk_fetch_request(q))) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001514 struct bio *bio;
1515 struct bio *rq_bio, *next_bio = NULL;
1516 bool do_write;
Alex Elderbd919d42012-07-13 20:35:11 -05001517 unsigned int size;
1518 u64 op_size = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001519 u64 ofs;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001520 int num_segs, cur_seg = 0;
1521 struct rbd_req_coll *coll;
Josh Durgind1d25642011-12-05 14:03:05 -08001522 struct ceph_snap_context *snapc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001523
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001524 dout("fetched request\n");
1525
1526 /* filter out block requests we don't understand */
1527 if ((rq->cmd_type != REQ_TYPE_FS)) {
1528 __blk_end_request_all(rq, 0);
Alex Elder00f1f362012-02-07 12:03:36 -06001529 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001530 }
1531
1532 /* deduce our operation (read, write) */
1533 do_write = (rq_data_dir(rq) == WRITE);
1534
1535 size = blk_rq_bytes(rq);
Alex Elder593a9e72012-02-07 12:03:37 -06001536 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001537 rq_bio = rq->bio;
Alex Elderf84344f2012-08-31 17:29:51 -05001538 if (do_write && rbd_dev->mapping.read_only) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001539 __blk_end_request_all(rq, -EROFS);
Alex Elder00f1f362012-02-07 12:03:36 -06001540 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001541 }
1542
1543 spin_unlock_irq(q->queue_lock);
1544
Josh Durgind1d25642011-12-05 14:03:05 -08001545 down_read(&rbd_dev->header_rwsem);
Josh Durgine88a36e2011-11-21 18:14:25 -08001546
Alex Elderf84344f2012-08-31 17:29:51 -05001547 if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
1548 !rbd_dev->mapping.snap_exists) {
Josh Durgine88a36e2011-11-21 18:14:25 -08001549 up_read(&rbd_dev->header_rwsem);
Josh Durgind1d25642011-12-05 14:03:05 -08001550 dout("request for non-existent snapshot");
1551 spin_lock_irq(q->queue_lock);
1552 __blk_end_request_all(rq, -ENXIO);
1553 continue;
Josh Durgine88a36e2011-11-21 18:14:25 -08001554 }
1555
Josh Durgind1d25642011-12-05 14:03:05 -08001556 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1557
1558 up_read(&rbd_dev->header_rwsem);
1559
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001560 dout("%s 0x%x bytes at 0x%llx\n",
1561 do_write ? "write" : "read",
Alex Elderbd919d42012-07-13 20:35:11 -05001562 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001563
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001564 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
Alex Elderdf111be2012-08-09 10:33:26 -07001565 if (num_segs <= 0) {
1566 spin_lock_irq(q->queue_lock);
1567 __blk_end_request_all(rq, num_segs);
1568 ceph_put_snap_context(snapc);
1569 continue;
1570 }
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001571 coll = rbd_alloc_coll(num_segs);
1572 if (!coll) {
1573 spin_lock_irq(q->queue_lock);
1574 __blk_end_request_all(rq, -ENOMEM);
Josh Durgind1d25642011-12-05 14:03:05 -08001575 ceph_put_snap_context(snapc);
Alex Elder00f1f362012-02-07 12:03:36 -06001576 continue;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001577 }
1578
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001579 do {
1580 /* a bio clone to be passed down to OSD req */
Alex Elderbd919d42012-07-13 20:35:11 -05001581 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
Alex Elder65ccfe22012-08-09 10:33:26 -07001582 op_size = rbd_segment_length(rbd_dev, ofs, size);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001583 kref_get(&coll->kref);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001584 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1585 op_size, GFP_ATOMIC);
1586 if (!bio) {
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001587 rbd_coll_end_req_index(rq, coll, cur_seg,
1588 -ENOMEM, op_size);
1589 goto next_seg;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001590 }
1591
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001592
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001593 /* init OSD command: write or read */
1594 if (do_write)
1595 rbd_req_write(rq, rbd_dev,
Josh Durgind1d25642011-12-05 14:03:05 -08001596 snapc,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001597 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001598 op_size, bio,
1599 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001600 else
1601 rbd_req_read(rq, rbd_dev,
Alex Elderf84344f2012-08-31 17:29:51 -05001602 rbd_dev->mapping.snap_id,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001603 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001604 op_size, bio,
1605 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001606
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001607next_seg:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001608 size -= op_size;
1609 ofs += op_size;
1610
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001611 cur_seg++;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001612 rq_bio = next_bio;
1613 } while (size > 0);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001614 kref_put(&coll->kref, rbd_coll_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001615
1616 if (bp)
1617 bio_pair_release(bp);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001618 spin_lock_irq(q->queue_lock);
Josh Durgind1d25642011-12-05 14:03:05 -08001619
1620 ceph_put_snap_context(snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001621 }
1622}
1623
1624/*
1625 * a queue callback. Makes sure that we don't create a bio that spans across
1626 * multiple osd objects. One exception would be with a single page bios,
1627 * which we handle later at bio_chain_clone
1628 */
1629static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1630 struct bio_vec *bvec)
1631{
1632 struct rbd_device *rbd_dev = q->queuedata;
Alex Elder593a9e72012-02-07 12:03:37 -06001633 unsigned int chunk_sectors;
1634 sector_t sector;
1635 unsigned int bio_sectors;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001636 int max;
1637
Alex Elder593a9e72012-02-07 12:03:37 -06001638 chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1639 sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1640 bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1641
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001642 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
Alex Elder593a9e72012-02-07 12:03:37 -06001643 + bio_sectors)) << SECTOR_SHIFT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001644 if (max < 0)
1645 max = 0; /* bio_add cannot handle a negative return */
1646 if (max <= bvec->bv_len && bio_sectors == 0)
1647 return bvec->bv_len;
1648 return max;
1649}
1650
1651static void rbd_free_disk(struct rbd_device *rbd_dev)
1652{
1653 struct gendisk *disk = rbd_dev->disk;
1654
1655 if (!disk)
1656 return;
1657
1658 rbd_header_free(&rbd_dev->header);
1659
1660 if (disk->flags & GENHD_FL_UP)
1661 del_gendisk(disk);
1662 if (disk->queue)
1663 blk_cleanup_queue(disk->queue);
1664 put_disk(disk);
1665}
1666
1667/*
Alex Elder4156d992012-08-02 11:29:46 -05001668 * Read the complete header for the given rbd device.
1669 *
1670 * Returns a pointer to a dynamically-allocated buffer containing
1671 * the complete and validated header. Caller can pass the address
1672 * of a variable that will be filled in with the version of the
1673 * header object at the time it was read.
1674 *
1675 * Returns a pointer-coded errno if a failure occurs.
1676 */
1677static struct rbd_image_header_ondisk *
1678rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
1679{
1680 struct rbd_image_header_ondisk *ondisk = NULL;
1681 u32 snap_count = 0;
1682 u64 names_size = 0;
1683 u32 want_count;
1684 int ret;
1685
1686 /*
1687 * The complete header will include an array of its 64-bit
1688 * snapshot ids, followed by the names of those snapshots as
1689 * a contiguous block of NUL-terminated strings. Note that
1690 * the number of snapshots could change by the time we read
1691 * it in, in which case we re-read it.
1692 */
1693 do {
1694 size_t size;
1695
1696 kfree(ondisk);
1697
1698 size = sizeof (*ondisk);
1699 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1700 size += names_size;
1701 ondisk = kmalloc(size, GFP_KERNEL);
1702 if (!ondisk)
1703 return ERR_PTR(-ENOMEM);
1704
1705 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
1706 rbd_dev->header_name,
1707 0, size,
1708 (char *) ondisk, version);
1709
1710 if (ret < 0)
1711 goto out_err;
1712 if (WARN_ON((size_t) ret < size)) {
1713 ret = -ENXIO;
1714 pr_warning("short header read for image %s"
1715 " (want %zd got %d)\n",
1716 rbd_dev->image_name, size, ret);
1717 goto out_err;
1718 }
1719 if (!rbd_dev_ondisk_valid(ondisk)) {
1720 ret = -ENXIO;
1721 pr_warning("invalid header for image %s\n",
1722 rbd_dev->image_name);
1723 goto out_err;
1724 }
1725
1726 names_size = le64_to_cpu(ondisk->snap_names_len);
1727 want_count = snap_count;
1728 snap_count = le32_to_cpu(ondisk->snap_count);
1729 } while (snap_count != want_count);
1730
1731 return ondisk;
1732
1733out_err:
1734 kfree(ondisk);
1735
1736 return ERR_PTR(ret);
1737}
1738
/*
 * Re-read the on-disk image header.
 */
1742static int rbd_read_header(struct rbd_device *rbd_dev,
1743 struct rbd_image_header *header)
1744{
Alex Elder4156d992012-08-02 11:29:46 -05001745 struct rbd_image_header_ondisk *ondisk;
1746 u64 ver = 0;
1747 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001748
Alex Elder4156d992012-08-02 11:29:46 -05001749 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1750 if (IS_ERR(ondisk))
1751 return PTR_ERR(ondisk);
1752 ret = rbd_header_from_disk(header, ondisk);
1753 if (ret >= 0)
1754 header->obj_version = ver;
1755 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001756
Alex Elder4156d992012-08-02 11:29:46 -05001757 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001758}
1759
1760/*
1761 * create a snapshot
1762 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001763static int rbd_header_add_snap(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001764 const char *snap_name,
1765 gfp_t gfp_flags)
1766{
1767 int name_len = strlen(snap_name);
1768 u64 new_snapid;
1769 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001770 void *data, *p, *e;
Alex Elder1dbb4392012-01-24 10:08:37 -06001771 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001772
1773 /* we should create a snapshot only if we're pointing at the head */
Alex Elderf84344f2012-08-31 17:29:51 -05001774 if (rbd_dev->mapping.snap_id != CEPH_NOSNAP)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001775 return -EINVAL;
1776
Alex Elder0ce1a792012-07-03 16:01:18 -05001777 monc = &rbd_dev->rbd_client->client->monc;
1778 ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
Alex Elderbd919d42012-07-13 20:35:11 -05001779 dout("created snapid=%llu\n", (unsigned long long) new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001780 if (ret < 0)
1781 return ret;
1782
1783 data = kmalloc(name_len + 16, gfp_flags);
1784 if (!data)
1785 return -ENOMEM;
1786
Sage Weil916d4d62011-05-12 16:10:50 -07001787 p = data;
1788 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001789
Sage Weil916d4d62011-05-12 16:10:50 -07001790 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1791 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001792
Alex Elder0bed54d2012-07-03 16:01:18 -05001793 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
Alex Elder0ce1a792012-07-03 16:01:18 -05001794 "rbd", "snap_add",
Alex Elderd67d4be2012-07-13 20:35:11 -05001795 data, p - data, NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001796
Sage Weil916d4d62011-05-12 16:10:50 -07001797 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001798
Alex Elder505cbb92012-07-19 08:49:18 -05001799 return ret < 0 ? ret : 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001800bad:
1801 return -ERANGE;
1802}
1803
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001804static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1805{
1806 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001807 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001808
Alex Eldera0593292012-07-19 09:09:27 -05001809 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder14e70852012-07-19 09:09:27 -05001810 __rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001811}
1812
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001813/*
1814 * only read the first part of the ondisk header, without the snaps info
1815 */
Alex Elderb8136232012-07-25 09:32:41 -05001816static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001817{
1818 int ret;
1819 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001820
1821 ret = rbd_read_header(rbd_dev, &h);
1822 if (ret < 0)
1823 return ret;
1824
Josh Durgina51aa0c2011-12-05 10:35:04 -08001825 down_write(&rbd_dev->header_rwsem);
1826
Sage Weil9db4b3e2011-04-19 22:49:06 -07001827 /* resized? */
Alex Elderf84344f2012-08-31 17:29:51 -05001828 if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) {
Josh Durgin474ef7c2011-11-21 17:13:54 -08001829 sector_t size = (sector_t) h.image_size / SECTOR_SIZE;
1830
Alex Elder99c1f082012-08-30 14:42:15 -05001831 if (size != (sector_t) rbd_dev->mapping.size) {
1832 dout("setting size to %llu sectors",
1833 (unsigned long long) size);
1834 rbd_dev->mapping.size = (u64) size;
1835 set_capacity(rbd_dev->disk, size);
1836 }
Josh Durgin474ef7c2011-11-21 17:13:54 -08001837 }
Sage Weil9db4b3e2011-04-19 22:49:06 -07001838
Alex Elder849b4262012-07-09 21:04:24 -05001839 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001840 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05001841 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08001842 /* osd requests may still refer to snapc */
1843 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001844
Alex Elderb8136232012-07-25 09:32:41 -05001845 if (hver)
1846 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08001847 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08001848 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001849 rbd_dev->header.snapc = h.snapc;
1850 rbd_dev->header.snap_names = h.snap_names;
1851 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05001852 /* Free the extra copy of the object prefix */
1853 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1854 kfree(h.object_prefix);
1855
Alex Elder9fcbb802012-08-23 23:48:49 -05001856 ret = rbd_dev_snap_devs_update(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001857
Josh Durginc6666012011-11-21 17:11:12 -08001858 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001859
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001860 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001861}
1862
Alex Elder1fe5e992012-07-25 09:32:41 -05001863static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
1864{
1865 int ret;
1866
1867 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1868 ret = __rbd_refresh_header(rbd_dev, hver);
1869 mutex_unlock(&ctl_mutex);
1870
1871 return ret;
1872}
1873
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001874static int rbd_init_disk(struct rbd_device *rbd_dev)
1875{
1876 struct gendisk *disk;
1877 struct request_queue *q;
1878 int rc;
Alex Elder593a9e72012-02-07 12:03:37 -06001879 u64 segment_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001880
1881 /* contact OSD, request size info about the object being mapped */
1882 rc = rbd_read_header(rbd_dev, &rbd_dev->header);
1883 if (rc)
1884 return rc;
1885
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001886 /* no need to lock here, as rbd_dev is not registered yet */
Alex Elder9fcbb802012-08-23 23:48:49 -05001887 rc = rbd_dev_snap_devs_update(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001888 if (rc)
1889 return rc;
1890
Alex Elder4e1105a2012-08-31 17:29:52 -05001891 rc = rbd_header_set_snap(rbd_dev, snap_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001892 if (rc)
1893 return rc;
1894
1895 /* create gendisk info */
1896 rc = -ENOMEM;
1897 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1898 if (!disk)
1899 goto out;
1900
Alex Elderf0f8cef2012-01-29 13:57:44 -06001901 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05001902 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001903 disk->major = rbd_dev->major;
1904 disk->first_minor = 0;
1905 disk->fops = &rbd_bd_ops;
1906 disk->private_data = rbd_dev;
1907
1908 /* init rq */
1909 rc = -ENOMEM;
1910 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1911 if (!q)
1912 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07001913
Alex Elder593a9e72012-02-07 12:03:37 -06001914 /* We use the default size, but let's be explicit about it. */
1915 blk_queue_physical_block_size(q, SECTOR_SIZE);
1916
Josh Durgin029bcbd2011-07-22 11:35:23 -07001917 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06001918 segment_size = rbd_obj_bytes(&rbd_dev->header);
1919 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1920 blk_queue_max_segment_size(q, segment_size);
1921 blk_queue_io_min(q, segment_size);
1922 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07001923
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001924 blk_queue_merge_bvec(q, rbd_merge_bvec);
1925 disk->queue = q;
1926
1927 q->queuedata = rbd_dev;
1928
1929 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001930
1931 /* finally, announce the disk to the world */
Alex Elder99c1f082012-08-30 14:42:15 -05001932 set_capacity(disk, (sector_t) rbd_dev->mapping.size / SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001933 add_disk(disk);
1934
1935 pr_info("%s: added with size 0x%llx\n",
Alex Elder99c1f082012-08-30 14:42:15 -05001936 disk->disk_name, (unsigned long long) rbd_dev->mapping.size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001937 return 0;
1938
1939out_disk:
1940 put_disk(disk);
1941out:
1942 return rc;
1943}
1944
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001945/*
1946 sysfs
1947*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001948
Alex Elder593a9e72012-02-07 12:03:37 -06001949static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1950{
1951 return container_of(dev, struct rbd_device, dev);
1952}
1953
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001954static ssize_t rbd_size_show(struct device *dev,
1955 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001956{
Alex Elder593a9e72012-02-07 12:03:37 -06001957 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001958 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001959
Josh Durgina51aa0c2011-12-05 10:35:04 -08001960 down_read(&rbd_dev->header_rwsem);
1961 size = get_capacity(rbd_dev->disk);
1962 up_read(&rbd_dev->header_rwsem);
1963
1964 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001965}
1966
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001967static ssize_t rbd_major_show(struct device *dev,
1968 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001969{
Alex Elder593a9e72012-02-07 12:03:37 -06001970 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001971
1972 return sprintf(buf, "%d\n", rbd_dev->major);
1973}
1974
1975static ssize_t rbd_client_id_show(struct device *dev,
1976 struct device_attribute *attr, char *buf)
1977{
Alex Elder593a9e72012-02-07 12:03:37 -06001978 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001979
Alex Elder1dbb4392012-01-24 10:08:37 -06001980 return sprintf(buf, "client%lld\n",
1981 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001982}
1983
1984static ssize_t rbd_pool_show(struct device *dev,
1985 struct device_attribute *attr, char *buf)
1986{
Alex Elder593a9e72012-02-07 12:03:37 -06001987 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001988
1989 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1990}
1991
Alex Elder9bb2f332012-07-12 10:46:35 -05001992static ssize_t rbd_pool_id_show(struct device *dev,
1993 struct device_attribute *attr, char *buf)
1994{
1995 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1996
1997 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1998}
1999
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002000static ssize_t rbd_name_show(struct device *dev,
2001 struct device_attribute *attr, char *buf)
2002{
Alex Elder593a9e72012-02-07 12:03:37 -06002003 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002004
Alex Elder0bed54d2012-07-03 16:01:18 -05002005 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002006}
2007
2008static ssize_t rbd_snap_show(struct device *dev,
2009 struct device_attribute *attr,
2010 char *buf)
2011{
Alex Elder593a9e72012-02-07 12:03:37 -06002012 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002013
Alex Elderf84344f2012-08-31 17:29:51 -05002014 return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002015}
2016
2017static ssize_t rbd_image_refresh(struct device *dev,
2018 struct device_attribute *attr,
2019 const char *buf,
2020 size_t size)
2021{
Alex Elder593a9e72012-02-07 12:03:37 -06002022 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05002023 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002024
Alex Elder1fe5e992012-07-25 09:32:41 -05002025 ret = rbd_refresh_header(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05002026
2027 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002028}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002029
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002030static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
2031static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
2032static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
2033static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
Alex Elder9bb2f332012-07-12 10:46:35 -05002034static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002035static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
2036static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2037static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
2038static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002039
2040static struct attribute *rbd_attrs[] = {
2041 &dev_attr_size.attr,
2042 &dev_attr_major.attr,
2043 &dev_attr_client_id.attr,
2044 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05002045 &dev_attr_pool_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002046 &dev_attr_name.attr,
2047 &dev_attr_current_snap.attr,
2048 &dev_attr_refresh.attr,
2049 &dev_attr_create_snap.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002050 NULL
2051};
2052
2053static struct attribute_group rbd_attr_group = {
2054 .attrs = rbd_attrs,
2055};
2056
2057static const struct attribute_group *rbd_attr_groups[] = {
2058 &rbd_attr_group,
2059 NULL
2060};
2061
2062static void rbd_sysfs_dev_release(struct device *dev)
2063{
2064}
2065
2066static struct device_type rbd_device_type = {
2067 .name = "rbd",
2068 .groups = rbd_attr_groups,
2069 .release = rbd_sysfs_dev_release,
2070};
2071
2072
2073/*
2074 sysfs - snapshots
2075*/
2076
2077static ssize_t rbd_snap_size_show(struct device *dev,
2078 struct device_attribute *attr,
2079 char *buf)
2080{
2081 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2082
Josh Durgin35915382011-12-05 18:25:13 -08002083 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002084}
2085
2086static ssize_t rbd_snap_id_show(struct device *dev,
2087 struct device_attribute *attr,
2088 char *buf)
2089{
2090 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2091
Josh Durgin35915382011-12-05 18:25:13 -08002092 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002093}
2094
2095static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2096static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
2097
2098static struct attribute *rbd_snap_attrs[] = {
2099 &dev_attr_snap_size.attr,
2100 &dev_attr_snap_id.attr,
2101 NULL,
2102};
2103
2104static struct attribute_group rbd_snap_attr_group = {
2105 .attrs = rbd_snap_attrs,
2106};
2107
2108static void rbd_snap_dev_release(struct device *dev)
2109{
2110 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2111 kfree(snap->name);
2112 kfree(snap);
2113}
2114
2115static const struct attribute_group *rbd_snap_attr_groups[] = {
2116 &rbd_snap_attr_group,
2117 NULL
2118};
2119
2120static struct device_type rbd_snap_device_type = {
2121 .groups = rbd_snap_attr_groups,
2122 .release = rbd_snap_dev_release,
2123};
2124
Alex Elder14e70852012-07-19 09:09:27 -05002125static void __rbd_remove_snap_dev(struct rbd_snap *snap)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002126{
2127 list_del(&snap->node);
2128 device_unregister(&snap->dev);
2129}
2130
Alex Elder14e70852012-07-19 09:09:27 -05002131static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002132 struct device *parent)
2133{
2134 struct device *dev = &snap->dev;
2135 int ret;
2136
2137 dev->type = &rbd_snap_device_type;
2138 dev->parent = parent;
2139 dev->release = rbd_snap_dev_release;
2140 dev_set_name(dev, "snap_%s", snap->name);
2141 ret = device_register(dev);
2142
2143 return ret;
2144}
2145
Alex Elder4e891e02012-07-10 20:30:10 -05002146static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2147 int i, const char *name)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002148{
Alex Elder4e891e02012-07-10 20:30:10 -05002149 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002150 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002151
2152 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002153 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002154 return ERR_PTR(-ENOMEM);
2155
2156 ret = -ENOMEM;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002157 snap->name = kstrdup(name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002158 if (!snap->name)
2159 goto err;
2160
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002161 snap->size = rbd_dev->header.snap_sizes[i];
2162 snap->id = rbd_dev->header.snapc->snaps[i];
2163 if (device_is_registered(&rbd_dev->dev)) {
Alex Elder14e70852012-07-19 09:09:27 -05002164 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002165 if (ret < 0)
2166 goto err;
2167 }
Alex Elder4e891e02012-07-10 20:30:10 -05002168
2169 return snap;
2170
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002171err:
2172 kfree(snap->name);
2173 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002174
2175 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002176}
2177
2178/*
Alex Elder35938152012-08-02 11:29:46 -05002179 * Scan the rbd device's current snapshot list and compare it to the
2180 * newly-received snapshot context. Remove any existing snapshots
2181 * not present in the new snapshot context. Add a new snapshot for
2182 * any snaphots in the snapshot context not in the current list.
2183 * And verify there are no changes to snapshots we already know
2184 * about.
2185 *
2186 * Assumes the snapshots in the snapshot context are sorted by
2187 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2188 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002189 */
Alex Elder9fcbb802012-08-23 23:48:49 -05002190static int rbd_dev_snap_devs_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002191{
Alex Elder35938152012-08-02 11:29:46 -05002192 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2193 const u32 snap_count = snapc->num_snaps;
2194 char *snap_name = rbd_dev->header.snap_names;
2195 struct list_head *head = &rbd_dev->snaps;
2196 struct list_head *links = head->next;
2197 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002198
Alex Elder9fcbb802012-08-23 23:48:49 -05002199 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
Alex Elder35938152012-08-02 11:29:46 -05002200 while (index < snap_count || links != head) {
2201 u64 snap_id;
2202 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002203
Alex Elder35938152012-08-02 11:29:46 -05002204 snap_id = index < snap_count ? snapc->snaps[index]
2205 : CEPH_NOSNAP;
2206 snap = links != head ? list_entry(links, struct rbd_snap, node)
2207 : NULL;
Alex Elderaafb230e2012-09-06 16:00:54 -05002208 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002209
Alex Elder35938152012-08-02 11:29:46 -05002210 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2211 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002212
Alex Elder35938152012-08-02 11:29:46 -05002213 /* Existing snapshot not in the new snap context */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002214
Alex Elderf84344f2012-08-31 17:29:51 -05002215 if (rbd_dev->mapping.snap_id == snap->id)
2216 rbd_dev->mapping.snap_exists = false;
Alex Elder35938152012-08-02 11:29:46 -05002217 __rbd_remove_snap_dev(snap);
Alex Elder9fcbb802012-08-23 23:48:49 -05002218 dout("%ssnap id %llu has been removed\n",
Alex Elderf84344f2012-08-31 17:29:51 -05002219 rbd_dev->mapping.snap_id == snap->id ?
2220 "mapped " : "",
Alex Elder9fcbb802012-08-23 23:48:49 -05002221 (unsigned long long) snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002222
Alex Elder35938152012-08-02 11:29:46 -05002223 /* Done with this list entry; advance */
2224
2225 links = next;
2226 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002227 }
Alex Elder35938152012-08-02 11:29:46 -05002228
Alex Elder9fcbb802012-08-23 23:48:49 -05002229 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2230 (unsigned long long) snap_id);
Alex Elder35938152012-08-02 11:29:46 -05002231 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2232 struct rbd_snap *new_snap;
2233
2234 /* We haven't seen this snapshot before */
2235
2236 new_snap = __rbd_add_snap_dev(rbd_dev, index,
2237 snap_name);
Alex Elder9fcbb802012-08-23 23:48:49 -05002238 if (IS_ERR(new_snap)) {
2239 int err = PTR_ERR(new_snap);
2240
2241 dout(" failed to add dev, error %d\n", err);
2242
2243 return err;
2244 }
Alex Elder35938152012-08-02 11:29:46 -05002245
2246 /* New goes before existing, or at end of list */
2247
Alex Elder9fcbb802012-08-23 23:48:49 -05002248 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05002249 if (snap)
2250 list_add_tail(&new_snap->node, &snap->node);
2251 else
Alex Elder523f3252012-08-30 00:16:37 -05002252 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05002253 } else {
2254 /* Already have this one */
2255
Alex Elder9fcbb802012-08-23 23:48:49 -05002256 dout(" already present\n");
2257
Alex Elderaafb230e2012-09-06 16:00:54 -05002258 rbd_assert(snap->size ==
2259 rbd_dev->header.snap_sizes[index]);
2260 rbd_assert(!strcmp(snap->name, snap_name));
Alex Elder35938152012-08-02 11:29:46 -05002261
2262 /* Done with this list entry; advance */
2263
2264 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002265 }
Alex Elder35938152012-08-02 11:29:46 -05002266
2267 /* Advance to the next entry in the snapshot context */
2268
2269 index++;
2270 snap_name += strlen(snap_name) + 1;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002271 }
Alex Elder9fcbb802012-08-23 23:48:49 -05002272 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002273
2274 return 0;
2275}
2276
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002277static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2278{
Alex Elderf0f8cef2012-01-29 13:57:44 -06002279 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002280 struct device *dev;
2281 struct rbd_snap *snap;
2282
2283 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2284 dev = &rbd_dev->dev;
2285
2286 dev->bus = &rbd_bus_type;
2287 dev->type = &rbd_device_type;
2288 dev->parent = &rbd_root_dev;
2289 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05002290 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002291 ret = device_register(dev);
2292 if (ret < 0)
Alex Elderf0f8cef2012-01-29 13:57:44 -06002293 goto out;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002294
2295 list_for_each_entry(snap, &rbd_dev->snaps, node) {
Alex Elder14e70852012-07-19 09:09:27 -05002296 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002297 if (ret < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002298 break;
2299 }
Alex Elderf0f8cef2012-01-29 13:57:44 -06002300out:
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002301 mutex_unlock(&ctl_mutex);
2302 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002303}
2304
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002305static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2306{
2307 device_unregister(&rbd_dev->dev);
2308}
2309
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002310static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2311{
2312 int ret, rc;
2313
2314 do {
Alex Elder0e6f3222012-07-25 09:32:40 -05002315 ret = rbd_req_sync_watch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002316 if (ret == -ERANGE) {
Alex Elder1fe5e992012-07-25 09:32:41 -05002317 rc = rbd_refresh_header(rbd_dev, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002318 if (rc < 0)
2319 return rc;
2320 }
2321 } while (ret == -ERANGE);
2322
2323 return ret;
2324}
2325
Alex Eldere2839302012-08-29 17:11:06 -05002326static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06002327
2328/*
Alex Elder499afd52012-02-02 08:13:29 -06002329 * Get a unique rbd identifier for the given new rbd_dev, and add
2330 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06002331 */
Alex Eldere2839302012-08-29 17:11:06 -05002332static void rbd_dev_id_get(struct rbd_device *rbd_dev)
Alex Elderb7f23c32012-01-29 13:57:43 -06002333{
Alex Eldere2839302012-08-29 17:11:06 -05002334 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
Alex Elder499afd52012-02-02 08:13:29 -06002335
2336 spin_lock(&rbd_dev_list_lock);
2337 list_add_tail(&rbd_dev->node, &rbd_dev_list);
2338 spin_unlock(&rbd_dev_list_lock);
Alex Eldere2839302012-08-29 17:11:06 -05002339 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
2340 (unsigned long long) rbd_dev->dev_id);
Alex Elder1ddbe942012-01-29 13:57:44 -06002341}
Alex Elderb7f23c32012-01-29 13:57:43 -06002342
Alex Elder1ddbe942012-01-29 13:57:44 -06002343/*
Alex Elder499afd52012-02-02 08:13:29 -06002344 * Remove an rbd_dev from the global list, and record that its
2345 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002346 */
Alex Eldere2839302012-08-29 17:11:06 -05002347static void rbd_dev_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002348{
Alex Elderd184f6b2012-01-29 13:57:44 -06002349 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05002350 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002351 int max_id;
2352
Alex Elderaafb230e2012-09-06 16:00:54 -05002353 rbd_assert(rbd_id > 0);
Alex Elder499afd52012-02-02 08:13:29 -06002354
Alex Eldere2839302012-08-29 17:11:06 -05002355 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2356 (unsigned long long) rbd_dev->dev_id);
Alex Elder499afd52012-02-02 08:13:29 -06002357 spin_lock(&rbd_dev_list_lock);
2358 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002359
2360 /*
2361 * If the id being "put" is not the current maximum, there
2362 * is nothing special we need to do.
2363 */
Alex Eldere2839302012-08-29 17:11:06 -05002364 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
Alex Elderd184f6b2012-01-29 13:57:44 -06002365 spin_unlock(&rbd_dev_list_lock);
2366 return;
2367 }
2368
2369 /*
2370 * We need to update the current maximum id. Search the
2371 * list to find out what it is. We're more likely to find
2372 * the maximum at the end, so search the list backward.
2373 */
2374 max_id = 0;
2375 list_for_each_prev(tmp, &rbd_dev_list) {
2376 struct rbd_device *rbd_dev;
2377
2378 rbd_dev = list_entry(tmp, struct rbd_device, node);
2379 if (rbd_id > max_id)
2380 max_id = rbd_id;
2381 }
Alex Elder499afd52012-02-02 08:13:29 -06002382 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002383
Alex Elder1ddbe942012-01-29 13:57:44 -06002384 /*
Alex Eldere2839302012-08-29 17:11:06 -05002385 * The max id could have been updated by rbd_dev_id_get(), in
Alex Elderd184f6b2012-01-29 13:57:44 -06002386 * which case it now accurately reflects the new maximum.
2387 * Be careful not to overwrite the maximum value in that
2388 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002389 */
Alex Eldere2839302012-08-29 17:11:06 -05002390 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2391 dout(" max dev id has been reset\n");
Alex Elderb7f23c32012-01-29 13:57:43 -06002392}
2393
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* skip any leading white space */

	return strcspn(*buf, spaces);	/* length of the token found */
}
2412
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	/* Copy only if the token and its terminator actually fit */
	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;		/* advance past the token regardless */

	return len;
}
2442
2443/*
Alex Elderea3352f2012-07-09 21:04:23 -05002444 * Finds the next token in *buf, dynamically allocates a buffer big
2445 * enough to hold a copy of it, and copies the token into the new
2446 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2447 * that a duplicate buffer is created even for a zero-length token.
2448 *
2449 * Returns a pointer to the newly-allocated duplicate, or a null
2450 * pointer if memory for the duplicate was not available. If
2451 * the lenp argument is a non-null pointer, the length of the token
2452 * (not including the '\0') is returned in *lenp.
2453 *
2454 * If successful, the *buf pointer will be updated to point beyond
2455 * the end of the found token.
2456 *
2457 * Note: uses GFP_KERNEL for allocation.
2458 */
2459static inline char *dup_token(const char **buf, size_t *lenp)
2460{
2461 char *dup;
2462 size_t len;
2463
2464 len = next_token(buf);
2465 dup = kmalloc(len + 1, GFP_KERNEL);
2466 if (!dup)
2467 return NULL;
2468
2469 memcpy(dup, *buf, len);
2470 *(dup + len) = '\0';
2471 *buf += len;
2472
2473 if (lenp)
2474 *lenp = len;
2475
2476 return dup;
2477}
2478
/*
 * This fills in the pool_name, image_name, image_name_len, rbd_dev,
 * rbd_md_name, and name fields of the given rbd_dev, based on the
 * list of monitor addresses and other options provided via
 * /sys/bus/rbd/add.  Returns a pointer to a dynamically-allocated
 * copy of the snapshot name to map if successful, or a
 * pointer-coded error otherwise.
 *
 * Expected input:  <mon_addrs> <options> <pool> <image> [<snap>]
 * The first four tokens are required; the snapshot name is optional
 * and defaults to RBD_SNAP_HEAD_NAME (map the image head).
 *
 * Ownership on success:
 *   - *mon_addrs points INTO the caller's buf (not a copy); the
 *     caller must keep buf alive while using it.
 *   - pool_name, image_name and header_name are allocated and owned
 *     by rbd_dev.
 *   - the returned snapshot name is owned by the caller.
 *
 * Note: rbd_dev is assumed to have been initially zero-filled.
 */
static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
				const char *buf,
				const char **mon_addrs,
				size_t *mon_addrs_size,
				char *options,
				size_t options_size)
{
	size_t len;
	char *err_ptr = ERR_PTR(-EINVAL);	/* bad/missing required token */
	char *snap_name;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len)
		return err_ptr;
	*mon_addrs_size = len + 1;	/* token length plus room for '\0' */
	*mon_addrs = buf;		/* aliases buf; not copied */

	buf += len;

	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return err_ptr;		/* missing, or would not fit */

	/* From here on, any failure is an allocation failure */
	err_ptr = ERR_PTR(-ENOMEM);
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Create the name of the header object */

	rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
						+ sizeof (RBD_SUFFIX),
					GFP_KERNEL);
	if (!rbd_dev->header_name)
		goto out_err;
	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);

	/* Snapshot name is optional */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	}
	snap_name = kmalloc(len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_err;
	memcpy(snap_name, buf, len);
	*(snap_name + len) = '\0';

	dout(" SNAP_NAME is <%s>, len is %zd\n", snap_name, len);

	return snap_name;

out_err:
	/* Undo any partial setup, leaving rbd_dev zero-filled again */
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->image_name);
	rbd_dev->image_name = NULL;
	rbd_dev->image_name_len = 0;
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return err_ptr;
}
2559
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002560static ssize_t rbd_add(struct bus_type *bus,
2561 const char *buf,
2562 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002563{
Alex Eldercb8627c2012-07-09 21:04:23 -05002564 char *options;
2565 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06002566 const char *mon_addrs = NULL;
2567 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06002568 struct ceph_osd_client *osdc;
2569 int rc = -ENOMEM;
Alex Elder3feeb8942012-08-31 17:29:52 -05002570 char *snap_name;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002571
2572 if (!try_module_get(THIS_MODULE))
2573 return -ENODEV;
2574
Alex Elder27cc2592012-02-02 08:13:30 -06002575 options = kmalloc(count, GFP_KERNEL);
2576 if (!options)
2577 goto err_nomem;
Alex Eldercb8627c2012-07-09 21:04:23 -05002578 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2579 if (!rbd_dev)
2580 goto err_nomem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002581
2582 /* static rbd_device initialization */
2583 spin_lock_init(&rbd_dev->lock);
2584 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002585 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08002586 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002587
Alex Elderd184f6b2012-01-29 13:57:44 -06002588 /* generate unique id: find highest unique id, add one */
Alex Eldere2839302012-08-29 17:11:06 -05002589 rbd_dev_id_get(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002590
Alex Eldera725f65e2012-02-02 08:13:30 -06002591 /* Fill in the device name, now that we have its id. */
Alex Elder81a89792012-02-02 08:13:30 -06002592 BUILD_BUG_ON(DEV_NAME_LEN
2593 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
Alex Elderde71a292012-07-03 16:01:19 -05002594 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
Alex Eldere124a822012-01-29 13:57:44 -06002595
Alex Eldera725f65e2012-02-02 08:13:30 -06002596 /* parse add command */
Alex Elder3feeb8942012-08-31 17:29:52 -05002597 snap_name = rbd_add_parse_args(rbd_dev, buf,
2598 &mon_addrs, &mon_addrs_size, options, count);
2599 if (IS_ERR(snap_name)) {
2600 rc = PTR_ERR(snap_name);
Alex Eldera725f65e2012-02-02 08:13:30 -06002601 goto err_put_id;
Alex Elder3feeb8942012-08-31 17:29:52 -05002602 }
Alex Eldera725f65e2012-02-02 08:13:30 -06002603
Alex Elderf8c38922012-08-10 13:12:07 -07002604 rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
2605 if (rc < 0)
Alex Elderf0f8cef2012-01-29 13:57:44 -06002606 goto err_put_id;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002607
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002608 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06002609 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002610 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2611 if (rc < 0)
2612 goto err_out_client;
Alex Elder9bb2f332012-07-12 10:46:35 -05002613 rbd_dev->pool_id = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002614
2615 /* register our block device */
Alex Elder27cc2592012-02-02 08:13:30 -06002616 rc = register_blkdev(0, rbd_dev->name);
2617 if (rc < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002618 goto err_out_client;
Alex Elder27cc2592012-02-02 08:13:30 -06002619 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002620
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002621 rc = rbd_bus_add_dev(rbd_dev);
2622 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002623 goto err_out_blkdev;
2624
Alex Elder32eec682012-02-08 16:11:14 -06002625 /*
2626 * At this point cleanup in the event of an error is the job
2627 * of the sysfs code (initiated by rbd_bus_del_dev()).
2628 *
2629 * Set up and announce blkdev mapping.
2630 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002631 rc = rbd_init_disk(rbd_dev);
2632 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002633 goto err_out_bus;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002634
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002635 rc = rbd_init_watch_dev(rbd_dev);
2636 if (rc)
2637 goto err_out_bus;
2638
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002639 return count;
2640
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002641err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002642 /* this will also clean up rest of rbd_dev stuff */
2643
2644 rbd_bus_del_dev(rbd_dev);
2645 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002646 return rc;
2647
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002648err_out_blkdev:
2649 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2650err_out_client:
2651 rbd_put_client(rbd_dev);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002652err_put_id:
Alex Eldercb8627c2012-07-09 21:04:23 -05002653 if (rbd_dev->pool_name) {
Alex Elderf84344f2012-08-31 17:29:51 -05002654 kfree(rbd_dev->mapping.snap_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002655 kfree(rbd_dev->header_name);
2656 kfree(rbd_dev->image_name);
Alex Eldercb8627c2012-07-09 21:04:23 -05002657 kfree(rbd_dev->pool_name);
2658 }
Alex Eldere2839302012-08-29 17:11:06 -05002659 rbd_dev_id_put(rbd_dev);
Alex Elder27cc2592012-02-02 08:13:30 -06002660err_nomem:
Alex Elder27cc2592012-02-02 08:13:30 -06002661 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05002662 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06002663
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002664 dout("Error adding device %s\n", buf);
2665 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06002666
2667 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002668}
2669
Alex Elderde71a292012-07-03 16:01:19 -05002670static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002671{
2672 struct list_head *tmp;
2673 struct rbd_device *rbd_dev;
2674
Alex Eldere124a822012-01-29 13:57:44 -06002675 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002676 list_for_each(tmp, &rbd_dev_list) {
2677 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05002678 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06002679 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002680 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06002681 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002682 }
Alex Eldere124a822012-01-29 13:57:44 -06002683 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002684 return NULL;
2685}
2686
/*
 * Final teardown of an rbd device; appears to be the struct device
 * release callback run after rbd_bus_del_dev() drops the last
 * reference.  Undoes everything rbd_add() set up, in reverse order:
 * stop watching, drop the client, free the disk and major number,
 * free the name strings, release the device id, free the structure.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop the header watch before tearing anything else down */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->mapping.snap_name);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_dev_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
2717
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002718static ssize_t rbd_remove(struct bus_type *bus,
2719 const char *buf,
2720 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002721{
2722 struct rbd_device *rbd_dev = NULL;
2723 int target_id, rc;
2724 unsigned long ul;
2725 int ret = count;
2726
2727 rc = strict_strtoul(buf, 10, &ul);
2728 if (rc)
2729 return rc;
2730
2731 /* convert to int; abort if we lost anything in the conversion */
2732 target_id = (int) ul;
2733 if (target_id != ul)
2734 return -EINVAL;
2735
2736 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2737
2738 rbd_dev = __rbd_get_dev(target_id);
2739 if (!rbd_dev) {
2740 ret = -ENOENT;
2741 goto done;
2742 }
2743
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002744 __rbd_remove_all_snaps(rbd_dev);
2745 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002746
2747done:
2748 mutex_unlock(&ctl_mutex);
Alex Elderaafb230e2012-09-06 16:00:54 -05002749
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002750 return ret;
2751}
2752
/*
 * Handler for the per-device snapshot-creation sysfs attribute (see
 * Documentation/ABI/testing/sysfs-bus-rbd): create a snapshot named
 * by the written string, refresh the in-core header, then notify
 * watchers.  Returns count on success or a negative error code.
 */
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int ret;
	char *name = kmalloc(count + 1, GFP_KERNEL);
	if (!name)
		return -ENOMEM;

	/*
	 * NOTE(review): snprintf() with size "count" copies at most
	 * count - 1 bytes, silently dropping the final byte of buf.
	 * This presumably relies on sysfs writes ending in a newline
	 * (as from "echo"); a write without one would lose its last
	 * character -- TODO confirm this is intended.
	 */
	snprintf(name, count, "%s", buf);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	ret = rbd_header_add_snap(rbd_dev,
				  name, GFP_KERNEL);
	if (ret < 0)
		goto err_unlock;

	ret = __rbd_refresh_header(rbd_dev, NULL);
	if (ret < 0)
		goto err_unlock;

	/* shouldn't hold ctl_mutex when notifying.. notify might
	   trigger a watch callback that would need to get that mutex */
	mutex_unlock(&ctl_mutex);

	/* make a best effort, don't error if failed */
	rbd_req_sync_notify(rbd_dev);

	ret = count;
	kfree(name);
	return ret;

err_unlock:
	mutex_unlock(&ctl_mutex);
	kfree(name);
	return ret;
}
2793
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002794/*
2795 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002796 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002797 */
2798static int rbd_sysfs_init(void)
2799{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002800 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002801
Alex Elderfed4c142012-02-07 12:03:36 -06002802 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06002803 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002804 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002805
Alex Elderfed4c142012-02-07 12:03:36 -06002806 ret = bus_register(&rbd_bus_type);
2807 if (ret < 0)
2808 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002809
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002810 return ret;
2811}
2812
/* Undo rbd_sysfs_init(): unregister the bus, then the root device. */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
2818
2819int __init rbd_init(void)
2820{
2821 int rc;
2822
2823 rc = rbd_sysfs_init();
2824 if (rc)
2825 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002826 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002827 return 0;
2828}
2829
/* Module exit: remove the /sys/bus/rbd control files. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2834
/* Module entry/exit points and metadata */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");