blob: 48901b51f648b28a81818d0426c5038d26e07f36 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define SECTOR_SHIFT	9
#define SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* It might be useful to have this defined elsewhere too */

#define U64_MAX	((u64) (~0ULL))

/* Driver name, used for the block device and in log messages */
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

#define RBD_MAX_SNAP_NAME_LEN	32
#define RBD_MAX_OPT_LEN		1024

/* Snapshot name that denotes mapping the base (head) image */
#define RBD_SNAP_HEAD_NAME	"-"

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

/* Mappings are writable unless the read-only option is given */
#define RBD_READ_ONLY_DEFAULT	false
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070079
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These four fields never change for a given rbd image */
	char *object_prefix;	/* prefix of data object names */
	__u8 obj_order;		/* log2 of the object (segment) size */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* copy of on-disk snapshot name data */
	u64 *snap_sizes;	/* one size per snapshot */

	u64 obj_version;
};

/* User-supplied mapping options (see rbd_opts_tokens below) */
struct rbd_options {
	bool	read_only;
};

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;	/* shared via rbd_client_find() */
	struct list_head	node;	/* entry in rbd_client_list */
};

/*
 * a request completion status
 */
struct rbd_req_status {
	int done;
	int rc;
	u64 bytes;
};

/*
 * a collection of requests
 */
struct rbd_req_coll {
	int			total;
	int			num_done;
	struct kref		kref;
	struct rbd_req_status	status[0];	/* trailing variable-length array */
};

/*
 * a single io request
 */
struct rbd_request {
	struct request		*rq;		/* blk layer request */
	struct bio		*bio;		/* cloned bio */
	struct page		**pages;	/* list of used pages */
	u64			len;
	int			coll_index;	/* slot in coll->status[] */
	struct rbd_req_coll	*coll;
};

/* One snapshot of an image, also represented as a sysfs device */
struct rbd_snap {
	struct device		dev;
	const char		*name;
	u64			size;
	struct list_head	node;	/* entry in rbd_device->snaps */
	u64			id;
};

/* Describes which snapshot (or the head) of an image is mapped */
struct rbd_mapping {
	char			*snap_name;
	u64			snap_id;	/* CEPH_NOSNAP for the head */
	u64			size;
	bool			snap_exists;
	bool			read_only;
};
158
/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	struct rbd_options	rbd_opts;	/* parsed mapping options */
	struct rbd_client	*rbd_client;	/* refcounted, possibly shared */

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue lock */

	struct rbd_image_header	header;
	char			*image_name;
	size_t			image_name_len;
	char			*header_name;
	char			*pool_name;
	int			pool_id;

	struct ceph_osd_event   *watch_event;
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;	/* which snapshot (or head) is mapped */

	struct list_head	node;	/* presumably links into rbd_dev_list — confirm */

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
};
198
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Forward declarations; definitions appear later in this file */
static int rbd_dev_snap_devs_update(struct rbd_device *rbd_dev);
static void rbd_dev_release(struct device *dev);
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count);
static void __rbd_remove_snap_dev(struct rbd_snap *snap);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);

/* Bus attributes: devices are mapped/unmapped by writing to add/remove */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/* Empty release; rbd_root_dev is static, nothing to free */
static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};

/*
 * NOTE(review): the macro below is a bare "if" rather than the usual
 * do { } while (0) wrapper — safe at current call sites, but fragile
 * next to an "else".
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800252
/* Take a reference on the rbd device's embedded struct device */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}

/* Drop the reference taken by rbd_get_dev() */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700262
/* Forward declaration; defined later in the file (not visible in this chunk) */
static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700264
/*
 * Block device open: refuse writable opens of read-only mappings,
 * otherwise pin the device and propagate its ro state to the blkdev.
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	rbd_get_dev(rbd_dev);
	set_device_ro(bdev, rbd_dev->mapping.read_only);

	return 0;
}
277
/* Block device release: drop the reference taken in rbd_open() */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	rbd_put_dev(rbd_dev);

	return 0;
}
286
/* Block device operations exposed to the block layer */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
292
293/*
294 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500295 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700296 */
Alex Elderf8c38922012-08-10 13:12:07 -0700297static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700298{
299 struct rbd_client *rbdc;
300 int ret = -ENOMEM;
301
302 dout("rbd_client_create\n");
303 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
304 if (!rbdc)
305 goto out_opt;
306
307 kref_init(&rbdc->kref);
308 INIT_LIST_HEAD(&rbdc->node);
309
Alex Elderbc534d82012-01-29 13:57:44 -0600310 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
311
Alex Elder43ae4702012-07-03 16:01:18 -0500312 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700313 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600314 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500315 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700316
317 ret = ceph_open_session(rbdc->client);
318 if (ret < 0)
319 goto out_err;
320
Alex Elder432b8582012-01-29 13:57:44 -0600321 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700322 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600323 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700324
Alex Elderbc534d82012-01-29 13:57:44 -0600325 mutex_unlock(&ctl_mutex);
326
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700327 dout("rbd_client_create created %p\n", rbdc);
328 return rbdc;
329
330out_err:
331 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600332out_mutex:
333 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700334 kfree(rbdc);
335out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500336 if (ceph_opts)
337 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400338 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700339}
340
/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 *
 * Returns NULL when no shareable client exists (including when the
 * caller asked for an unshared client via CEPH_OPT_NOSHARE).
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			/* Take the reference before dropping the list lock */
			kref_get(&client_node->kref);
			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}
365
366/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700367 * mount options
368 */
369enum {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700370 Opt_last_int,
371 /* int args above */
372 Opt_last_string,
373 /* string args above */
Alex Eldercc0538b2012-08-10 13:12:07 -0700374 Opt_read_only,
375 Opt_read_write,
376 /* Boolean args above */
377 Opt_last_bool,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700378};
379
Alex Elder43ae4702012-07-03 16:01:18 -0500380static match_table_t rbd_opts_tokens = {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700381 /* int args above */
382 /* string args above */
Alex Elderf84344f2012-08-31 17:29:51 -0500383 {Opt_read_only, "mapping.read_only"},
Alex Eldercc0538b2012-08-10 13:12:07 -0700384 {Opt_read_only, "ro"}, /* Alternate spelling */
385 {Opt_read_write, "read_write"},
386 {Opt_read_write, "rw"}, /* Alternate spelling */
387 /* Boolean args above */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700388 {-1, NULL}
389};
390
/*
 * Parse one token from the rbd options string.  Used as the callback
 * passed to ceph_parse_options(); @private is the struct rbd_options
 * being filled in.  Returns 0 on success, -EINVAL for an unknown
 * token, or the match_int() error for a malformed integer argument.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* Classify the token by its position relative to the Opt_last_* markers */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* Every token in rbd_opts_tokens must be handled above */
		rbd_assert(false);
		break;
	}
	return 0;
}
431
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  On success rbd_dev->rbd_client is set and 0
 * is returned; on failure a negative errno is returned.
 */
static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
				size_t mon_addr_len, char *options)
{
	struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
	struct ceph_options *ceph_opts;
	struct rbd_client *rbdc;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	/* rbd-specific tokens are handled by the parse_rbd_opts_token callback */
	ceph_opts = ceph_parse_options(options, mon_addr,
					mon_addr + mon_addr_len,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(ceph_opts))
		return PTR_ERR(ceph_opts);

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		/* using an existing client */
		ceph_destroy_options(ceph_opts);
	} else {
		/* rbd_client_create() takes ownership of ceph_opts */
		rbdc = rbd_client_create(ceph_opts);
		if (IS_ERR(rbdc))
			return PTR_ERR(rbdc);
	}
	rbd_dev->rbd_client = rbdc;

	return 0;
}
464
/*
 * Destroy ceph client
 *
 * Caller must hold rbd_client_list_lock.
 *
 * NOTE(review): this function itself takes rbd_client_list_lock, so
 * the comment above appears stale — confirm against current callers.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
482
/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_device *rbd_dev)
{
	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
	rbd_dev->rbd_client = NULL;	/* guard against reuse after put */
}
492
/*
 * Destroy requests collection (kref release callback; the collection
 * and its trailing status[] array are one allocation)
 */
static void rbd_coll_release(struct kref *kref)
{
	struct rbd_req_coll *coll =
		container_of(kref, struct rbd_req_coll, kref);

	dout("rbd_coll_release %p\n", coll);
	kfree(coll);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700504
/*
 * Sanity-check an on-disk (format 1) image header: magic text must
 * match, and the snapshot counts/name lengths must be small enough
 * that the in-memory header sizes computed from them fit in a size_t.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire the snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
533
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700534/*
535 * Create a new header structure, translate header format from the on-disk
536 * header.
537 */
538static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500539 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700540{
Alex Elderccece232012-07-10 20:30:10 -0500541 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500542 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500543 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500544 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700545
Alex Elder6a523252012-07-19 17:12:59 -0500546 memset(header, 0, sizeof (*header));
547
Alex Elder103a1502012-08-02 11:29:45 -0500548 snap_count = le32_to_cpu(ondisk->snap_count);
549
Alex Elder58c17b02012-08-23 23:22:06 -0500550 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
551 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500552 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700553 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500554 memcpy(header->object_prefix, ondisk->object_prefix, len);
555 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600556
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700557 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500558 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
559
Alex Elder621901d2012-08-23 23:22:06 -0500560 /* Save a copy of the snapshot names */
561
Alex Elderf785cc12012-08-23 23:22:06 -0500562 if (snap_names_len > (u64) SIZE_MAX)
563 return -EIO;
564 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700565 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500566 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500567 /*
568 * Note that rbd_dev_v1_header_read() guarantees
569 * the ondisk buffer we're working with has
570 * snap_names_len bytes beyond the end of the
571 * snapshot id array, this memcpy() is safe.
572 */
573 memcpy(header->snap_names, &ondisk->snaps[snap_count],
574 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500575
Alex Elder621901d2012-08-23 23:22:06 -0500576 /* Record each snapshot's size */
577
Alex Elderd2bb24e2012-07-26 23:37:14 -0500578 size = snap_count * sizeof (*header->snap_sizes);
579 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700580 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500581 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500582 for (i = 0; i < snap_count; i++)
583 header->snap_sizes[i] =
584 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700585 } else {
Alex Elderccece232012-07-10 20:30:10 -0500586 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700587 header->snap_names = NULL;
588 header->snap_sizes = NULL;
589 }
Alex Elder849b4262012-07-09 21:04:24 -0500590
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700591 header->obj_order = ondisk->options.order;
592 header->crypt_type = ondisk->options.crypt_type;
593 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500594
Alex Elder621901d2012-08-23 23:22:06 -0500595 /* Allocate and fill in the snapshot context */
596
Alex Elderf84344f2012-08-31 17:29:51 -0500597 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500598 size = sizeof (struct ceph_snap_context);
599 size += snap_count * sizeof (header->snapc->snaps[0]);
600 header->snapc = kzalloc(size, GFP_KERNEL);
601 if (!header->snapc)
602 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700603
604 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500605 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700606 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500607 for (i = 0; i < snap_count; i++)
608 header->snapc->snaps[i] =
609 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700610
611 return 0;
612
Alex Elder6a523252012-07-19 17:12:59 -0500613out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500614 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500615 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700616 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500617 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500618 kfree(header->object_prefix);
619 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500620
Alex Elder00f1f362012-02-07 12:03:36 -0600621 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700622}
623
/*
 * Look up a snapshot by name and, when found, record its id and size
 * in the device's mapping.  Returns 0 on success, -ENOENT otherwise.
 */
static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
{

	struct rbd_snap *snap;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!strcmp(snap_name, snap->name)) {
			rbd_dev->mapping.snap_id = snap->id;
			rbd_dev->mapping.size = snap->size;

			return 0;
		}
	}

	return -ENOENT;
}
640
/*
 * Initialize the device's mapping for the given snapshot name.
 * RBD_SNAP_HEAD_NAME ("-") selects the base image, which is writable
 * unless the read-only option was given; a real snapshot is always
 * mapped read-only.  Returns 0, or -ENOENT for an unknown snapshot.
 */
static int rbd_header_set_snap(struct rbd_device *rbd_dev, char *snap_name)
{
	int ret;

	if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		rbd_dev->mapping.snap_id = CEPH_NOSNAP;
		rbd_dev->mapping.size = rbd_dev->header.image_size;
		rbd_dev->mapping.snap_exists = false;
		rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only;
		ret = 0;
	} else {
		/* snap_by_name() fills in snap_id and size on success */
		ret = snap_by_name(rbd_dev, snap_name);
		if (ret < 0)
			goto done;
		rbd_dev->mapping.snap_exists = true;
		rbd_dev->mapping.read_only = true;
	}
	rbd_dev->mapping.snap_name = snap_name;
done:
	return ret;
}
663
/*
 * Release everything rbd_header_from_disk() allocated.  Pointers are
 * reset to NULL so a double free is harmless.
 */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	ceph_put_snap_context(header->snapc);
	header->snapc = NULL;
}
675
/*
 * Build the name of the object backing the segment containing the
 * given image offset: "<object_prefix>.<segment number as %012llx>".
 * Returns a kmalloc'd string the caller must free, or NULL on
 * allocation or formatting failure.
 */
static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
{
	char *name;
	u64 segment;
	int ret;

	name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
	if (!name)
		return NULL;
	segment = offset >> rbd_dev->header.obj_order;
	ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
			rbd_dev->header.object_prefix, segment);
	/* Truncation is treated as an error rather than silently used */
	if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
		pr_err("error formatting segment name for #%llu (%d)\n",
			segment, ret);
		kfree(name);
		name = NULL;
	}

	return name;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700697
Alex Elder65ccfe22012-08-09 10:33:26 -0700698static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
699{
700 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700701
Alex Elder65ccfe22012-08-09 10:33:26 -0700702 return offset & (segment_size - 1);
703}
704
/*
 * Clamp an I/O of @length bytes at image offset @offset so it does
 * not cross the end of the containing segment.  Returns the usable
 * length within that segment.
 */
static u64 rbd_segment_length(struct rbd_device *rbd_dev,
				u64 offset, u64 length)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	offset &= segment_size - 1;

	/* offset + length below must not wrap around */
	rbd_assert(length <= U64_MAX - offset);
	if (offset + length > segment_size)
		length = segment_size - offset;

	return length;
}
718
/*
 * Number of segments (objects) spanned by the byte range [ofs, ofs+len).
 * Returns 0 for an empty range and -ERANGE if ofs + len would overflow.
 */
static int rbd_get_num_segments(struct rbd_image_header *header,
				u64 ofs, u64 len)
{
	u64 start_seg;
	u64 end_seg;

	if (!len)
		return 0;
	if (len - 1 > U64_MAX - ofs)
		return -ERANGE;

	/* len - 1 keeps end_seg the last segment actually touched */
	start_seg = ofs >> header->obj_order;
	end_seg = (ofs + len - 1) >> header->obj_order;

	return end_seg - start_seg + 1;
}
735
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700736/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700737 * returns the size of an object in the image
738 */
739static u64 rbd_obj_bytes(struct rbd_image_header *header)
740{
741 return 1 << header->obj_order;
742}
743
744/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700745 * bio helpers
746 */
747
748static void bio_chain_put(struct bio *chain)
749{
750 struct bio *tmp;
751
752 while (chain) {
753 tmp = chain;
754 chain = chain->bi_next;
755 bio_put(tmp);
756 }
757}
758
/*
 * zeros a bio chain, starting at specific offset
 *
 * Walks every segment of every bio in the chain, tracking the running
 * byte position; any segment bytes at or beyond start_ofs are cleared.
 * Mapping is done with bvec_kmap_irq() since segment pages may be in
 * high memory.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* Zero only the part of this segment past start_ofs */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
785
/*
 * bio_chain_clone - clone a chain of bios up to a certain length.
 * might return a bio_pair that will need to be released.
 *
 * Clones bios from *old until exactly @len bytes are covered.  On
 * return *old points at the first unconsumed bio, *next at the bio to
 * resume from (either the untouched tail or the second half of a
 * split), and *bp at any bio_pair created by a mid-bio split.
 * Returns the new chain, or NULL on allocation/split failure.
 */
static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
				   struct bio_pair **bp,
				   int len, gfp_t gfpmask)
{
	struct bio *old_chain = *old;
	struct bio *new_chain = NULL;
	struct bio *tail;	/* last bio appended; valid once new_chain != NULL */
	int total = 0;

	/* release the bio_pair left over from a previous call */
	if (*bp) {
		bio_pair_release(*bp);
		*bp = NULL;
	}

	while (old_chain && (total < len)) {
		struct bio *tmp;

		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
		if (!tmp)
			goto err_out;
		gfpmask &= ~__GFP_WAIT;	/* can't wait after the first */

		if (total + old_chain->bi_size > len) {
			struct bio_pair *bp;

			/*
			 * this split can only happen with a single paged bio,
			 * split_bio will BUG_ON if this is not the case
			 */
			dout("bio_chain_clone split! total=%d remaining=%d"
			     "bi_size=%u\n",
			     total, len - total, old_chain->bi_size);

			/* split the bio. We'll release it either in the next
			   call, or it will have to be released outside */
			bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
			if (!bp)
				goto err_out;

			/* clone only the first half; second half resumes later */
			__bio_clone(tmp, &bp->bio1);

			*next = &bp->bio2;
		} else {
			__bio_clone(tmp, old_chain);
			*next = old_chain->bi_next;
		}

		tmp->bi_bdev = NULL;
		tmp->bi_next = NULL;
		/* append tmp to the new chain */
		if (new_chain)
			tail->bi_next = tmp;
		else
			new_chain = tmp;
		tail = tmp;
		old_chain = old_chain->bi_next;

		total += tmp->bi_size;
	}

	/* caller guarantees the chain covers at least len bytes */
	rbd_assert(total == len);

	*old = old_chain;

	return new_chain;

err_out:
	dout("bio_chain_clone with err\n");
	bio_chain_put(new_chain);
	return NULL;
}
860
861/*
862 * helpers for osd request op vectors.
863 */
Alex Elder57cfc102012-06-26 12:57:03 -0700864static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
865 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700866{
Alex Elder57cfc102012-06-26 12:57:03 -0700867 struct ceph_osd_req_op *ops;
868
869 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
870 if (!ops)
871 return NULL;
872
873 ops[0].op = opcode;
874
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700875 /*
876 * op extent offset and length will be set later on
877 * in calc_raw_layout()
878 */
Alex Elder57cfc102012-06-26 12:57:03 -0700879 ops[0].payload_len = payload_len;
880
881 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700882}
883
/* Free an op vector allocated by rbd_create_rw_ops(). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
888
/*
 * Record completion of one request in a collection and complete, in
 * order, the prefix of requests that have all finished.  Out-of-order
 * OSD completions are held back until every earlier index is done.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	/* no collection: single-segment request, complete it directly */
	if (!coll) {
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	/* queue lock protects coll state and __blk_end_request() */
	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	/* advance max over the contiguous run of completed segments */
	max = min = coll->num_done;
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i<max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		/* drop the reference taken when the segment was issued */
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
926
/* Complete the collection slot associated with a single rbd_request. */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
932
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700933/*
934 * Send ceph osd request
935 */
936static int rbd_do_request(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -0500937 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700938 struct ceph_snap_context *snapc,
939 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -0500940 const char *object_name, u64 ofs, u64 len,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700941 struct bio *bio,
942 struct page **pages,
943 int num_pages,
944 int flags,
945 struct ceph_osd_req_op *ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700946 struct rbd_req_coll *coll,
947 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700948 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700949 struct ceph_msg *msg),
950 struct ceph_osd_request **linger_req,
951 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700952{
953 struct ceph_osd_request *req;
954 struct ceph_file_layout *layout;
955 int ret;
956 u64 bno;
957 struct timespec mtime = CURRENT_TIME;
958 struct rbd_request *req_data;
959 struct ceph_osd_request_head *reqhead;
Alex Elder1dbb4392012-01-24 10:08:37 -0600960 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700961
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700962 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700963 if (!req_data) {
964 if (coll)
965 rbd_coll_end_req_index(rq, coll, coll_index,
966 -ENOMEM, len);
967 return -ENOMEM;
968 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700969
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700970 if (coll) {
971 req_data->coll = coll;
972 req_data->coll_index = coll_index;
973 }
974
Alex Elderbd919d42012-07-13 20:35:11 -0500975 dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
976 (unsigned long long) ofs, (unsigned long long) len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700977
Alex Elder0ce1a792012-07-03 16:01:18 -0500978 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder1dbb4392012-01-24 10:08:37 -0600979 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
980 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -0700981 if (!req) {
Sage Weil4ad12622011-05-03 09:23:36 -0700982 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700983 goto done_pages;
984 }
985
986 req->r_callback = rbd_cb;
987
988 req_data->rq = rq;
989 req_data->bio = bio;
990 req_data->pages = pages;
991 req_data->len = len;
992
993 req->r_priv = req_data;
994
995 reqhead = req->r_request->front.iov_base;
996 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
997
Alex Elderaded07e2012-07-03 16:01:18 -0500998 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700999 req->r_oid_len = strlen(req->r_oid);
1000
1001 layout = &req->r_file_layout;
1002 memset(layout, 0, sizeof(*layout));
1003 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1004 layout->fl_stripe_count = cpu_to_le32(1);
1005 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
Alex Elder0ce1a792012-07-03 16:01:18 -05001006 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
Alex Elder1dbb4392012-01-24 10:08:37 -06001007 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
1008 req, ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001009
1010 ceph_osdc_build_request(req, ofs, &len,
1011 ops,
1012 snapc,
1013 &mtime,
1014 req->r_oid, req->r_oid_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001015
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001016 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -06001017 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001018 *linger_req = req;
1019 }
1020
Alex Elder1dbb4392012-01-24 10:08:37 -06001021 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001022 if (ret < 0)
1023 goto done_err;
1024
1025 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -06001026 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001027 if (ver)
1028 *ver = le64_to_cpu(req->r_reassert_version.version);
Alex Elderbd919d42012-07-13 20:35:11 -05001029 dout("reassert_ver=%llu\n",
1030 (unsigned long long)
1031 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001032 ceph_osdc_put_request(req);
1033 }
1034 return ret;
1035
1036done_err:
1037 bio_chain_put(req_data->bio);
1038 ceph_osdc_put_request(req);
1039done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001040 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001041 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001042 return ret;
1043}
1044
/*
 * Ceph osd op callback
 *
 * Completion handler for async requests issued by rbd_do_op().
 * Parses the reply, zero-fills short or missing reads, reports the
 * result into the request collection, and releases the bio chain,
 * the osd request and the per-request bookkeeping.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
	     (unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* object doesn't exist: a read of it is all zeroes */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero the tail and report the full length */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1084
/* Minimal completion callback: just drop the request reference. */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1089
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a page vector to back the transfer, issues the request
 * via rbd_do_request() with no callback (so it waits), and for reads
 * copies the received data into @buf.  Returns bytes transferred or
 * a negative errno.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int flags,
			   struct ceph_osd_req_op *ops,
			   const char *object_name,
			   u64 ofs, u64 len,
			   char *buf,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(ops != NULL);

	num_pages = calc_pages_for(ofs , len);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* rbd_cb == NULL makes rbd_do_request() wait for completion */
	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			     object_name, ofs, len, NULL,
			     pages, num_pages,
			     flags,
			     ops,
			     NULL, 0,
			     NULL,
			     linger_req, ver);
	if (ret < 0)
		goto done;

	/* on a successful read, ret is the number of bytes received */
	if ((flags & CEPH_OSD_FLAG_READ) && buf)
		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1133
/*
 * Do an asynchronous ceph osd operation
 *
 * Maps the image extent [ofs, ofs + len) onto its backing object
 * (the caller has already split requests on segment boundaries, so
 * the extent must lie within one object) and issues an async request
 * completed by rbd_req_cb().
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 snapid,
		     int opcode, int flags,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;

	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	/* only writes carry a data payload toward the OSD */
	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1186
1187/*
1188 * Request async osd write
1189 */
1190static int rbd_req_write(struct request *rq,
1191 struct rbd_device *rbd_dev,
1192 struct ceph_snap_context *snapc,
1193 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001194 struct bio *bio,
1195 struct rbd_req_coll *coll,
1196 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001197{
1198 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1199 CEPH_OSD_OP_WRITE,
1200 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001201 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001202}
1203
1204/*
1205 * Request async osd read
1206 */
1207static int rbd_req_read(struct request *rq,
1208 struct rbd_device *rbd_dev,
1209 u64 snapid,
1210 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001211 struct bio *bio,
1212 struct rbd_req_coll *coll,
1213 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001214{
1215 return rbd_do_op(rq, rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001216 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001217 CEPH_OSD_OP_READ,
1218 CEPH_OSD_FLAG_READ,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001219 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001220}
1221
/*
 * Request sync osd read
 *
 * Synchronously read [ofs, ofs + len) of @object_name at @snapid
 * into @buf.  Returns bytes read or a negative errno.
 */
static int rbd_req_sync_read(struct rbd_device *rbd_dev,
			     u64 snapid,
			     const char *object_name,
			     u64 ofs, u64 len,
			     char *buf,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
	if (!ops)
		return -ENOMEM;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      snapid,
			      CEPH_OSD_FLAG_READ,
			      ops, object_name, ofs, len, buf, NULL, ver);
	rbd_destroy_ops(ops);

	return ret;
}
1247
/*
 * Acknowledge a notification on the header object.
 *
 * (The old comment said "Request sync osd watch", copy-pasted from a
 * neighbor; this sends a NOTIFY_ACK.)  Despite the _sync_ name the
 * request completes asynchronously via rbd_simple_req_cb().
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = cpu_to_le64(ver);
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			     rbd_dev->header_name, 0, 0, NULL,
			     NULL, 0,
			     CEPH_OSD_FLAG_READ,
			     ops,
			     NULL, 0,
			     rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1277
/*
 * Watch event callback for the header object: refresh the cached
 * header (the image may have changed) and acknowledge the notify so
 * the notifier is unblocked.
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	u64 hver;
	int rc;

	if (!rbd_dev)
		return;

	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
	     rbd_dev->header_name, (unsigned long long) notify_id,
	     (unsigned int) opcode);
	rc = rbd_refresh_header(rbd_dev, &hver);
	if (rc)
		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
			   " update snaps: %d\n", rbd_dev->major, rc);

	/* ack even if the refresh failed, so the notifier isn't stalled */
	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
}
1297
/*
 * Request sync osd watch
 *
 * Register a watch on the header object so rbd_watch_cb() is invoked
 * when it changes.  Creates the osd event first (its cookie goes into
 * the WATCH op), then issues a lingering synchronous request stored
 * in rbd_dev->watch_request.
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 == register the watch */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1341
/*
 * Request sync osd unwatch
 *
 * Tear down the watch registered by rbd_req_sync_watch(): send a
 * WATCH op with flag == 0 (unregister) using the same event cookie,
 * then cancel the osd event.
 */
static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 0;	/* 0 == unregister */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL, NULL, NULL);


	/* event is cancelled even if the unwatch request failed */
	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	return ret;
}
1371
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001372struct rbd_notify_info {
Alex Elder0ce1a792012-07-03 16:01:18 -05001373 struct rbd_device *rbd_dev;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001374};
1375
1376static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1377{
Alex Elder0ce1a792012-07-03 16:01:18 -05001378 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1379 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001380 return;
1381
Alex Elderbd919d42012-07-13 20:35:11 -05001382 dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n",
1383 rbd_dev->header_name, (unsigned long long) notify_id,
1384 (unsigned int) opcode);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001385}
1386
/*
 * Request sync osd notify
 *
 * Send a NOTIFY on the header object and wait (bounded by
 * CEPH_OSD_TIMEOUT_DEFAULT) for watchers to acknowledge.
 * NOTE(review): the wait result is logged but the function returns 0
 * regardless of it — presumably intentional best-effort; confirm.
 */
static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_event *event;
	struct rbd_notify_info info;
	int payload_len = sizeof(u32) + sizeof(u32);
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len);
	if (!ops)
		return -ENOMEM;

	info.rbd_dev = rbd_dev;

	/* one_shot event; data pointer is &info (see rbd_notify_cb) */
	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
				     (void *)&info, &event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = 1;
	ops[0].watch.flag = 1;
	ops[0].watch.cookie = event->cookie;
	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
	ops[0].watch.timeout = 12;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL, NULL, NULL);
	if (ret < 0)
		goto fail_event;

	ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
	dout("ceph_osdc_wait_event returned %d\n", ret);
	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(event);
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1436
/*
 * Synchronously invoke a method (@class_name.@method_name) on an OSD
 * object, passing @len bytes of @data as input.
 *
 * (The old comment said "Request sync osd read", copy-pasted from a
 * neighbor; this issues a CEPH_OSD_OP_CALL.)
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *data,
			     int len,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int class_name_len = strlen(class_name);
	int method_name_len = strlen(method_name);
	int ret;

	/* payload carries class name + method name + input data */
	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL,
				class_name_len + method_name_len + len);
	if (!ops)
		return -ENOMEM;

	ops[0].cls.class_name = class_name;
	ops[0].cls.class_len = (__u8) class_name_len;
	ops[0].cls.method_name = method_name;
	ops[0].cls.method_len = (__u8) method_name_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = data;
	ops[0].cls.indata_len = len;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      object_name, 0, 0, NULL, NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1477
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001478static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1479{
1480 struct rbd_req_coll *coll =
1481 kzalloc(sizeof(struct rbd_req_coll) +
1482 sizeof(struct rbd_req_status) * num_reqs,
1483 GFP_ATOMIC);
1484
1485 if (!coll)
1486 return NULL;
1487 coll->total = num_reqs;
1488 kref_init(&coll->kref);
1489 return coll;
1490}
1491
/*
 * block device queue callback
 *
 * Pulls requests off the queue, splits each on object (segment)
 * boundaries, and issues one async OSD read/write per segment.
 * Completion of the whole block request is tracked by an
 * rbd_req_coll.  Called with q->queue_lock held; the lock is dropped
 * around the (sleeping/allocating) per-request work and retaken
 * before blk_fetch_request() is called again.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;
	struct bio_pair *bp = NULL;

	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		struct bio *rq_bio, *next_bio = NULL;
		bool do_write;
		unsigned int size;
		u64 op_size = 0;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		rq_bio = rq->bio;
		if (do_write && rbd_dev->mapping.read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		spin_unlock_irq(q->queue_lock);

		/* header_rwsem guards the snap state and snap context */
		down_read(&rbd_dev->header_rwsem);

		if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
		    !rbd_dev->mapping.snap_exists) {
			up_read(&rbd_dev->header_rwsem);
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		/* take a snapc reference to use after dropping the rwsem */
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		if (num_segs <= 0) {
			/* num_segs is 0 (empty) or a negative errno */
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, num_segs);
			ceph_put_snap_context(snapc);
			continue;
		}
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		do {
			/* a bio clone to be passed down to OSD req */
			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
			op_size = rbd_segment_length(rbd_dev, ofs, size);
			/* one coll reference per issued segment */
			kref_get(&coll->kref);
			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
					      op_size, GFP_ATOMIC);
			if (!bio) {
				/* record the failure but keep issuing the rest */
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, op_size);
				goto next_seg;
			}


			/* init OSD command: write or read */
			if (do_write)
				rbd_req_write(rq, rbd_dev,
					      snapc,
					      ofs,
					      op_size, bio,
					      coll, cur_seg);
			else
				rbd_req_read(rq, rbd_dev,
					     rbd_dev->mapping.snap_id,
					     ofs,
					     op_size, bio,
					     coll, cur_seg);

next_seg:
			size -= op_size;
			ofs += op_size;

			cur_seg++;
			rq_bio = next_bio;
		} while (size > 0);
		/* drop the allocation reference; callbacks hold the rest */
		kref_put(&coll->kref, rbd_coll_release);

		if (bp)
			bio_pair_release(bp);
		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1611
1612/*
1613 * a queue callback. Makes sure that we don't create a bio that spans across
1614 * multiple osd objects. One exception would be with a single page bios,
1615 * which we handle later at bio_chain_clone
1616 */
1617static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1618 struct bio_vec *bvec)
1619{
1620 struct rbd_device *rbd_dev = q->queuedata;
Alex Elder593a9e72012-02-07 12:03:37 -06001621 unsigned int chunk_sectors;
1622 sector_t sector;
1623 unsigned int bio_sectors;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001624 int max;
1625
Alex Elder593a9e72012-02-07 12:03:37 -06001626 chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1627 sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1628 bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1629
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001630 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
Alex Elder593a9e72012-02-07 12:03:37 -06001631 + bio_sectors)) << SECTOR_SHIFT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001632 if (max < 0)
1633 max = 0; /* bio_add cannot handle a negative return */
1634 if (max <= bvec->bv_len && bio_sectors == 0)
1635 return bvec->bv_len;
1636 return max;
1637}
1638
1639static void rbd_free_disk(struct rbd_device *rbd_dev)
1640{
1641 struct gendisk *disk = rbd_dev->disk;
1642
1643 if (!disk)
1644 return;
1645
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001646 if (disk->flags & GENHD_FL_UP)
1647 del_gendisk(disk);
1648 if (disk->queue)
1649 blk_cleanup_queue(disk->queue);
1650 put_disk(disk);
1651}
1652
1653/*
Alex Elder4156d992012-08-02 11:29:46 -05001654 * Read the complete header for the given rbd device.
1655 *
1656 * Returns a pointer to a dynamically-allocated buffer containing
1657 * the complete and validated header. Caller can pass the address
1658 * of a variable that will be filled in with the version of the
1659 * header object at the time it was read.
1660 *
1661 * Returns a pointer-coded errno if a failure occurs.
1662 */
1663static struct rbd_image_header_ondisk *
1664rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
1665{
1666 struct rbd_image_header_ondisk *ondisk = NULL;
1667 u32 snap_count = 0;
1668 u64 names_size = 0;
1669 u32 want_count;
1670 int ret;
1671
1672 /*
1673 * The complete header will include an array of its 64-bit
1674 * snapshot ids, followed by the names of those snapshots as
1675 * a contiguous block of NUL-terminated strings. Note that
1676 * the number of snapshots could change by the time we read
1677 * it in, in which case we re-read it.
1678 */
1679 do {
1680 size_t size;
1681
1682 kfree(ondisk);
1683
1684 size = sizeof (*ondisk);
1685 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1686 size += names_size;
1687 ondisk = kmalloc(size, GFP_KERNEL);
1688 if (!ondisk)
1689 return ERR_PTR(-ENOMEM);
1690
1691 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
1692 rbd_dev->header_name,
1693 0, size,
1694 (char *) ondisk, version);
1695
1696 if (ret < 0)
1697 goto out_err;
1698 if (WARN_ON((size_t) ret < size)) {
1699 ret = -ENXIO;
1700 pr_warning("short header read for image %s"
1701 " (want %zd got %d)\n",
1702 rbd_dev->image_name, size, ret);
1703 goto out_err;
1704 }
1705 if (!rbd_dev_ondisk_valid(ondisk)) {
1706 ret = -ENXIO;
1707 pr_warning("invalid header for image %s\n",
1708 rbd_dev->image_name);
1709 goto out_err;
1710 }
1711
1712 names_size = le64_to_cpu(ondisk->snap_names_len);
1713 want_count = snap_count;
1714 snap_count = le32_to_cpu(ondisk->snap_count);
1715 } while (snap_count != want_count);
1716
1717 return ondisk;
1718
1719out_err:
1720 kfree(ondisk);
1721
1722 return ERR_PTR(ret);
1723}
1724
1725/*
1726 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001727 */
1728static int rbd_read_header(struct rbd_device *rbd_dev,
1729 struct rbd_image_header *header)
1730{
Alex Elder4156d992012-08-02 11:29:46 -05001731 struct rbd_image_header_ondisk *ondisk;
1732 u64 ver = 0;
1733 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001734
Alex Elder4156d992012-08-02 11:29:46 -05001735 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1736 if (IS_ERR(ondisk))
1737 return PTR_ERR(ondisk);
1738 ret = rbd_header_from_disk(header, ondisk);
1739 if (ret >= 0)
1740 header->obj_version = ver;
1741 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001742
Alex Elder4156d992012-08-02 11:29:46 -05001743 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001744}
1745
1746/*
1747 * create a snapshot
1748 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001749static int rbd_header_add_snap(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001750 const char *snap_name,
1751 gfp_t gfp_flags)
1752{
1753 int name_len = strlen(snap_name);
1754 u64 new_snapid;
1755 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001756 void *data, *p, *e;
Alex Elder1dbb4392012-01-24 10:08:37 -06001757 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001758
1759 /* we should create a snapshot only if we're pointing at the head */
Alex Elderf84344f2012-08-31 17:29:51 -05001760 if (rbd_dev->mapping.snap_id != CEPH_NOSNAP)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001761 return -EINVAL;
1762
Alex Elder0ce1a792012-07-03 16:01:18 -05001763 monc = &rbd_dev->rbd_client->client->monc;
1764 ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
Alex Elderbd919d42012-07-13 20:35:11 -05001765 dout("created snapid=%llu\n", (unsigned long long) new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001766 if (ret < 0)
1767 return ret;
1768
1769 data = kmalloc(name_len + 16, gfp_flags);
1770 if (!data)
1771 return -ENOMEM;
1772
Sage Weil916d4d62011-05-12 16:10:50 -07001773 p = data;
1774 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001775
Sage Weil916d4d62011-05-12 16:10:50 -07001776 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1777 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001778
Alex Elder0bed54d2012-07-03 16:01:18 -05001779 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
Alex Elder0ce1a792012-07-03 16:01:18 -05001780 "rbd", "snap_add",
Alex Elderd67d4be2012-07-13 20:35:11 -05001781 data, p - data, NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001782
Sage Weil916d4d62011-05-12 16:10:50 -07001783 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001784
Alex Elder505cbb92012-07-19 08:49:18 -05001785 return ret < 0 ? ret : 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001786bad:
1787 return -ERANGE;
1788}
1789
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001790static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1791{
1792 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001793 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001794
Alex Eldera0593292012-07-19 09:09:27 -05001795 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder14e70852012-07-19 09:09:27 -05001796 __rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001797}
1798
/*
 * Re-read the image header and install its contents in
 * rbd_dev->header under the header rwsem, updating the block
 * device capacity if the image was resized.  If hver is non-NULL
 * it receives the version of the header object that was read.
 * Callers serialize via ctl_mutex (see rbd_refresh_header()).
 */
static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	struct rbd_image_header h;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	down_write(&rbd_dev->header_rwsem);

	/* resized?  Only a head (non-snapshot) mapping can change size */
	if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) {
		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;

		if (size != (sector_t) rbd_dev->mapping.size) {
			dout("setting size to %llu sectors",
				(unsigned long long) size);
			rbd_dev->mapping.size = (u64) size;
			set_capacity(rbd_dev->disk, size);
		}
	}

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	/* Take ownership of the freshly-read header fields */
	if (hver)
		*hver = h.obj_version;
	rbd_dev->header.obj_version = h.obj_version;
	rbd_dev->header.image_size = h.image_size;
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	/* Reconcile the snapshot device list with the new snap context */
	ret = rbd_dev_snap_devs_update(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
1848
Alex Elder1fe5e992012-07-25 09:32:41 -05001849static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
1850{
1851 int ret;
1852
1853 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1854 ret = __rbd_refresh_header(rbd_dev, hver);
1855 mutex_unlock(&ctl_mutex);
1856
1857 return ret;
1858}
1859
/*
 * Allocate and set up the gendisk and request queue for an rbd
 * device.  Queue I/O limits are sized to the rbd object size so
 * requests line up with object boundaries.  On success the disk is
 * stored in rbd_dev->disk; it is not added to the system here.
 * Returns 0 or -ENOMEM.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* keep bios from spanning objects; see rbd_merge_bvec() */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
1906
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001907/*
1908 sysfs
1909*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001910
/* Map a sysfs device embedded in an rbd_device back to the rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1915
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001916static ssize_t rbd_size_show(struct device *dev,
1917 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001918{
Alex Elder593a9e72012-02-07 12:03:37 -06001919 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001920 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001921
Josh Durgina51aa0c2011-12-05 10:35:04 -08001922 down_read(&rbd_dev->header_rwsem);
1923 size = get_capacity(rbd_dev->disk);
1924 up_read(&rbd_dev->header_rwsem);
1925
1926 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001927}
1928
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001929static ssize_t rbd_major_show(struct device *dev,
1930 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001931{
Alex Elder593a9e72012-02-07 12:03:37 -06001932 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001933
1934 return sprintf(buf, "%d\n", rbd_dev->major);
1935}
1936
1937static ssize_t rbd_client_id_show(struct device *dev,
1938 struct device_attribute *attr, char *buf)
1939{
Alex Elder593a9e72012-02-07 12:03:37 -06001940 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001941
Alex Elder1dbb4392012-01-24 10:08:37 -06001942 return sprintf(buf, "client%lld\n",
1943 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001944}
1945
1946static ssize_t rbd_pool_show(struct device *dev,
1947 struct device_attribute *attr, char *buf)
1948{
Alex Elder593a9e72012-02-07 12:03:37 -06001949 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001950
1951 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1952}
1953
Alex Elder9bb2f332012-07-12 10:46:35 -05001954static ssize_t rbd_pool_id_show(struct device *dev,
1955 struct device_attribute *attr, char *buf)
1956{
1957 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1958
1959 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1960}
1961
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001962static ssize_t rbd_name_show(struct device *dev,
1963 struct device_attribute *attr, char *buf)
1964{
Alex Elder593a9e72012-02-07 12:03:37 -06001965 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001966
Alex Elder0bed54d2012-07-03 16:01:18 -05001967 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001968}
1969
1970static ssize_t rbd_snap_show(struct device *dev,
1971 struct device_attribute *attr,
1972 char *buf)
1973{
Alex Elder593a9e72012-02-07 12:03:37 -06001974 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001975
Alex Elderf84344f2012-08-31 17:29:51 -05001976 return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001977}
1978
1979static ssize_t rbd_image_refresh(struct device *dev,
1980 struct device_attribute *attr,
1981 const char *buf,
1982 size_t size)
1983{
Alex Elder593a9e72012-02-07 12:03:37 -06001984 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05001985 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001986
Alex Elder1fe5e992012-07-25 09:32:41 -05001987 ret = rbd_refresh_header(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05001988
1989 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001990}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001991
/*
 * Per-device sysfs attributes; see
 * Documentation/ABI/testing/sysfs-bus-rbd for their meanings.
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	&dev_attr_create_snap.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};
2023
/*
 * Type-level release callback; intentionally empty.
 * NOTE(review): rbd_bus_add_dev() installs rbd_dev_release as the
 * per-device ->release callback, so no cleanup appears to belong
 * here -- confirm.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
}
2027
/* Device type shared by all rbd devices registered on the rbd bus */
static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
2033
2034
2035/*
2036 sysfs - snapshots
2037*/
2038
2039static ssize_t rbd_snap_size_show(struct device *dev,
2040 struct device_attribute *attr,
2041 char *buf)
2042{
2043 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2044
Josh Durgin35915382011-12-05 18:25:13 -08002045 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002046}
2047
2048static ssize_t rbd_snap_id_show(struct device *dev,
2049 struct device_attribute *attr,
2050 char *buf)
2051{
2052 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2053
Josh Durgin35915382011-12-05 18:25:13 -08002054 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002055}
2056
/* Per-snapshot sysfs attributes, exposed under the snap_<name> device */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};
2069
/*
 * Release callback for a snapshot device: frees the snapshot name
 * and the rbd_snap itself once the last reference is dropped.
 */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}
2076
static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

/* Device type for snapshot devices; release frees the rbd_snap */
static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2086
/*
 * Unlink a snapshot from its device's snap list and unregister its
 * sysfs device.  The device's release callback (rbd_snap_dev_release)
 * frees the rbd_snap when the last reference goes away.
 */
static void __rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	device_unregister(&snap->dev);
}
2092
Alex Elder14e70852012-07-19 09:09:27 -05002093static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002094 struct device *parent)
2095{
2096 struct device *dev = &snap->dev;
2097 int ret;
2098
2099 dev->type = &rbd_snap_device_type;
2100 dev->parent = parent;
2101 dev->release = rbd_snap_dev_release;
2102 dev_set_name(dev, "snap_%s", snap->name);
2103 ret = device_register(dev);
2104
2105 return ret;
2106}
2107
Alex Elder4e891e02012-07-10 20:30:10 -05002108static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2109 int i, const char *name)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002110{
Alex Elder4e891e02012-07-10 20:30:10 -05002111 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002112 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002113
2114 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002115 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002116 return ERR_PTR(-ENOMEM);
2117
2118 ret = -ENOMEM;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002119 snap->name = kstrdup(name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002120 if (!snap->name)
2121 goto err;
2122
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002123 snap->size = rbd_dev->header.snap_sizes[i];
2124 snap->id = rbd_dev->header.snapc->snaps[i];
2125 if (device_is_registered(&rbd_dev->dev)) {
Alex Elder14e70852012-07-19 09:09:27 -05002126 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002127 if (ret < 0)
2128 goto err;
2129 }
Alex Elder4e891e02012-07-10 20:30:10 -05002130
2131 return snap;
2132
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002133err:
2134 kfree(snap->name);
2135 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002136
2137 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002138}
2139
2140/*
Alex Elder35938152012-08-02 11:29:46 -05002141 * Scan the rbd device's current snapshot list and compare it to the
2142 * newly-received snapshot context. Remove any existing snapshots
2143 * not present in the new snapshot context. Add a new snapshot for
2144 * any snaphots in the snapshot context not in the current list.
2145 * And verify there are no changes to snapshots we already know
2146 * about.
2147 *
2148 * Assumes the snapshots in the snapshot context are sorted by
2149 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2150 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002151 */
Alex Elder9fcbb802012-08-23 23:48:49 -05002152static int rbd_dev_snap_devs_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002153{
Alex Elder35938152012-08-02 11:29:46 -05002154 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2155 const u32 snap_count = snapc->num_snaps;
2156 char *snap_name = rbd_dev->header.snap_names;
2157 struct list_head *head = &rbd_dev->snaps;
2158 struct list_head *links = head->next;
2159 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002160
Alex Elder9fcbb802012-08-23 23:48:49 -05002161 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
Alex Elder35938152012-08-02 11:29:46 -05002162 while (index < snap_count || links != head) {
2163 u64 snap_id;
2164 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002165
Alex Elder35938152012-08-02 11:29:46 -05002166 snap_id = index < snap_count ? snapc->snaps[index]
2167 : CEPH_NOSNAP;
2168 snap = links != head ? list_entry(links, struct rbd_snap, node)
2169 : NULL;
Alex Elderaafb230e2012-09-06 16:00:54 -05002170 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002171
Alex Elder35938152012-08-02 11:29:46 -05002172 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2173 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002174
Alex Elder35938152012-08-02 11:29:46 -05002175 /* Existing snapshot not in the new snap context */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002176
Alex Elderf84344f2012-08-31 17:29:51 -05002177 if (rbd_dev->mapping.snap_id == snap->id)
2178 rbd_dev->mapping.snap_exists = false;
Alex Elder35938152012-08-02 11:29:46 -05002179 __rbd_remove_snap_dev(snap);
Alex Elder9fcbb802012-08-23 23:48:49 -05002180 dout("%ssnap id %llu has been removed\n",
Alex Elderf84344f2012-08-31 17:29:51 -05002181 rbd_dev->mapping.snap_id == snap->id ?
2182 "mapped " : "",
Alex Elder9fcbb802012-08-23 23:48:49 -05002183 (unsigned long long) snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002184
Alex Elder35938152012-08-02 11:29:46 -05002185 /* Done with this list entry; advance */
2186
2187 links = next;
2188 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002189 }
Alex Elder35938152012-08-02 11:29:46 -05002190
Alex Elder9fcbb802012-08-23 23:48:49 -05002191 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2192 (unsigned long long) snap_id);
Alex Elder35938152012-08-02 11:29:46 -05002193 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2194 struct rbd_snap *new_snap;
2195
2196 /* We haven't seen this snapshot before */
2197
2198 new_snap = __rbd_add_snap_dev(rbd_dev, index,
2199 snap_name);
Alex Elder9fcbb802012-08-23 23:48:49 -05002200 if (IS_ERR(new_snap)) {
2201 int err = PTR_ERR(new_snap);
2202
2203 dout(" failed to add dev, error %d\n", err);
2204
2205 return err;
2206 }
Alex Elder35938152012-08-02 11:29:46 -05002207
2208 /* New goes before existing, or at end of list */
2209
Alex Elder9fcbb802012-08-23 23:48:49 -05002210 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05002211 if (snap)
2212 list_add_tail(&new_snap->node, &snap->node);
2213 else
Alex Elder523f3252012-08-30 00:16:37 -05002214 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05002215 } else {
2216 /* Already have this one */
2217
Alex Elder9fcbb802012-08-23 23:48:49 -05002218 dout(" already present\n");
2219
Alex Elderaafb230e2012-09-06 16:00:54 -05002220 rbd_assert(snap->size ==
2221 rbd_dev->header.snap_sizes[index]);
2222 rbd_assert(!strcmp(snap->name, snap_name));
Alex Elder35938152012-08-02 11:29:46 -05002223
2224 /* Done with this list entry; advance */
2225
2226 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002227 }
Alex Elder35938152012-08-02 11:29:46 -05002228
2229 /* Advance to the next entry in the snapshot context */
2230
2231 index++;
2232 snap_name += strlen(snap_name) + 1;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002233 }
Alex Elder9fcbb802012-08-23 23:48:49 -05002234 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002235
2236 return 0;
2237}
2238
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002239static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2240{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002241 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05002242 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002243
2244 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002245
Alex Eldercd789ab2012-08-30 00:16:38 -05002246 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002247 dev->bus = &rbd_bus_type;
2248 dev->type = &rbd_device_type;
2249 dev->parent = &rbd_root_dev;
2250 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05002251 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002252 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002253
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002254 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05002255
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002256 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002257}
2258
/* Remove the rbd device from sysfs; inverse of rbd_bus_add_dev(). */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2263
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002264static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2265{
2266 int ret, rc;
2267
2268 do {
Alex Elder0e6f3222012-07-25 09:32:40 -05002269 ret = rbd_req_sync_watch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002270 if (ret == -ERANGE) {
Alex Elder1fe5e992012-07-25 09:32:41 -05002271 rc = rbd_refresh_header(rbd_dev, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002272 if (rc < 0)
2273 return rc;
2274 }
2275 } while (ret == -ERANGE);
2276
2277 return ret;
2278}
2279
/* Highest device id handed out so far (see rbd_dev_id_get/_put) */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06002281
/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* Atomic increment makes the id unique; the first id is 1 */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002296
Alex Elder1ddbe942012-01-29 13:57:44 -06002297/*
Alex Elder499afd52012-02-02 08:13:29 -06002298 * Remove an rbd_dev from the global list, and record that its
2299 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002300 */
Alex Eldere2839302012-08-29 17:11:06 -05002301static void rbd_dev_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002302{
Alex Elderd184f6b2012-01-29 13:57:44 -06002303 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05002304 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002305 int max_id;
2306
Alex Elderaafb230e2012-09-06 16:00:54 -05002307 rbd_assert(rbd_id > 0);
Alex Elder499afd52012-02-02 08:13:29 -06002308
Alex Eldere2839302012-08-29 17:11:06 -05002309 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2310 (unsigned long long) rbd_dev->dev_id);
Alex Elder499afd52012-02-02 08:13:29 -06002311 spin_lock(&rbd_dev_list_lock);
2312 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002313
2314 /*
2315 * If the id being "put" is not the current maximum, there
2316 * is nothing special we need to do.
2317 */
Alex Eldere2839302012-08-29 17:11:06 -05002318 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
Alex Elderd184f6b2012-01-29 13:57:44 -06002319 spin_unlock(&rbd_dev_list_lock);
2320 return;
2321 }
2322
2323 /*
2324 * We need to update the current maximum id. Search the
2325 * list to find out what it is. We're more likely to find
2326 * the maximum at the end, so search the list backward.
2327 */
2328 max_id = 0;
2329 list_for_each_prev(tmp, &rbd_dev_list) {
2330 struct rbd_device *rbd_dev;
2331
2332 rbd_dev = list_entry(tmp, struct rbd_device, node);
2333 if (rbd_id > max_id)
2334 max_id = rbd_id;
2335 }
Alex Elder499afd52012-02-02 08:13:29 -06002336 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002337
Alex Elder1ddbe942012-01-29 13:57:44 -06002338 /*
Alex Eldere2839302012-08-29 17:11:06 -05002339 * The max id could have been updated by rbd_dev_id_get(), in
Alex Elderd184f6b2012-01-29 13:57:44 -06002340 * which case it now accurately reflects the new maximum.
2341 * Be careful not to overwrite the maximum value in that
2342 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002343 */
Alex Eldere2839302012-08-29 17:11:06 -05002344 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2345 dout(" max dev id has been reset\n");
Alex Elderb7f23c32012-01-29 13:57:43 -06002346}
2347
Alex Eldera725f65e2012-02-02 08:13:30 -06002348/*
Alex Eldere28fff262012-02-02 08:13:30 -06002349 * Skips over white space at *buf, and updates *buf to point to the
2350 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06002351 * the token (string of non-white space characters) found. Note
2352 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06002353 */
static inline size_t next_token(const char **buf)
{
	/*
	 * Exactly the characters for which isspace() returns nonzero
	 * in the "C" and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";
	size_t skipped = strspn(*buf, spaces);

	*buf += skipped;		/* *buf now points at the token (or NUL) */

	return strcspn(*buf, spaces);	/* length of the token at *buf */
}
2366
2367/*
2368 * Finds the next token in *buf, and if the provided token buffer is
2369 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06002370 * copied, is guaranteed to be terminated with '\0'. Note that *buf
2371 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06002372 *
2373 * Returns the length of the token found (not including the '\0').
2374 * Return value will be 0 if no token is found, and it will be >=
2375 * token_size if the token would not fit.
2376 *
Alex Elder593a9e72012-02-07 12:03:37 -06002377 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06002378 * found token. Note that this occurs even if the token buffer is
2379 * too small to hold it.
2380 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t tok_len = next_token(buf);

	/*
	 * Only copy (and NUL-terminate) when the destination can hold
	 * the whole token; *buf is consumed either way, mirroring the
	 * documented contract above.
	 */
	if (tok_len < token_size) {
		memcpy(token, *buf, tok_len);
		token[tok_len] = '\0';
	}
	*buf += tok_len;

	return tok_len;
}
2396
2397/*
Alex Elderea3352f2012-07-09 21:04:23 -05002398 * Finds the next token in *buf, dynamically allocates a buffer big
2399 * enough to hold a copy of it, and copies the token into the new
2400 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2401 * that a duplicate buffer is created even for a zero-length token.
2402 *
2403 * Returns a pointer to the newly-allocated duplicate, or a null
2404 * pointer if memory for the duplicate was not available. If
2405 * the lenp argument is a non-null pointer, the length of the token
2406 * (not including the '\0') is returned in *lenp.
2407 *
2408 * If successful, the *buf pointer will be updated to point beyond
2409 * the end of the found token.
2410 *
2411 * Note: uses GFP_KERNEL for allocation.
2412 */
2413static inline char *dup_token(const char **buf, size_t *lenp)
2414{
2415 char *dup;
2416 size_t len;
2417
2418 len = next_token(buf);
2419 dup = kmalloc(len + 1, GFP_KERNEL);
2420 if (!dup)
2421 return NULL;
2422
2423 memcpy(dup, *buf, len);
2424 *(dup + len) = '\0';
2425 *buf += len;
2426
2427 if (lenp)
2428 *lenp = len;
2429
2430 return dup;
2431}
2432
2433/*
Alex Elder3feeb8942012-08-31 17:29:52 -05002434 * This fills in the pool_name, image_name, image_name_len, rbd_dev,
2435 * rbd_md_name, and name fields of the given rbd_dev, based on the
2436 * list of monitor addresses and other options provided via
2437 * /sys/bus/rbd/add. Returns a pointer to a dynamically-allocated
2438 * copy of the snapshot name to map if successful, or a
2439 * pointer-coded error otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05002440 *
2441 * Note: rbd_dev is assumed to have been initially zero-filled.
Alex Eldera725f65e2012-02-02 08:13:30 -06002442 */
Alex Elder3feeb8942012-08-31 17:29:52 -05002443static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
2444 const char *buf,
2445 const char **mon_addrs,
2446 size_t *mon_addrs_size,
2447 char *options,
2448 size_t options_size)
Alex Eldera725f65e2012-02-02 08:13:30 -06002449{
Alex Elderd22f76e2012-07-12 10:46:35 -05002450 size_t len;
Alex Elder3feeb8942012-08-31 17:29:52 -05002451 char *err_ptr = ERR_PTR(-EINVAL);
2452 char *snap_name;
Alex Eldere28fff262012-02-02 08:13:30 -06002453
2454 /* The first four tokens are required */
2455
Alex Elder7ef32142012-02-02 08:13:30 -06002456 len = next_token(&buf);
2457 if (!len)
Alex Elder3feeb8942012-08-31 17:29:52 -05002458 return err_ptr;
Alex Elder5214ecc2012-02-02 08:13:30 -06002459 *mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06002460 *mon_addrs = buf;
2461
2462 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06002463
Alex Eldere28fff262012-02-02 08:13:30 -06002464 len = copy_token(&buf, options, options_size);
2465 if (!len || len >= options_size)
Alex Elder3feeb8942012-08-31 17:29:52 -05002466 return err_ptr;
Alex Eldera725f65e2012-02-02 08:13:30 -06002467
Alex Elder3feeb8942012-08-31 17:29:52 -05002468 err_ptr = ERR_PTR(-ENOMEM);
Alex Elderd22f76e2012-07-12 10:46:35 -05002469 rbd_dev->pool_name = dup_token(&buf, NULL);
2470 if (!rbd_dev->pool_name)
Alex Elderd22f76e2012-07-12 10:46:35 -05002471 goto out_err;
Alex Eldere28fff262012-02-02 08:13:30 -06002472
Alex Elder0bed54d2012-07-03 16:01:18 -05002473 rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
2474 if (!rbd_dev->image_name)
Alex Elderbf3e5ae2012-07-09 21:04:23 -05002475 goto out_err;
Alex Eldere28fff262012-02-02 08:13:30 -06002476
Alex Elder3feeb8942012-08-31 17:29:52 -05002477 /* Snapshot name is optional */
2478 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05002479 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05002480 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
2481 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elder849b4262012-07-09 21:04:24 -05002482 }
Alex Elder3feeb8942012-08-31 17:29:52 -05002483 snap_name = kmalloc(len + 1, GFP_KERNEL);
2484 if (!snap_name)
2485 goto out_err;
2486 memcpy(snap_name, buf, len);
2487 *(snap_name + len) = '\0';
Alex Eldere28fff262012-02-02 08:13:30 -06002488
Alex Elder3feeb8942012-08-31 17:29:52 -05002489dout(" SNAP_NAME is <%s>, len is %zd\n", snap_name, len);
2490
2491 return snap_name;
Alex Elderd22f76e2012-07-12 10:46:35 -05002492
2493out_err:
Alex Elder0bed54d2012-07-03 16:01:18 -05002494 kfree(rbd_dev->image_name);
Alex Elderd78fd7a2012-07-26 23:37:14 -05002495 rbd_dev->image_name = NULL;
2496 rbd_dev->image_name_len = 0;
Alex Elderd22f76e2012-07-12 10:46:35 -05002497 kfree(rbd_dev->pool_name);
2498 rbd_dev->pool_name = NULL;
2499
Alex Elder3feeb8942012-08-31 17:29:52 -05002500 return err_ptr;
Alex Eldera725f65e2012-02-02 08:13:30 -06002501}
2502
/*
 * Handler for writes to /sys/bus/rbd/add.  Parses the user-supplied
 * "mon_addrs options pool image [snap]" string, creates and registers
 * a new rbd device, reads its header from the OSDs and announces the
 * block device.  Returns count on success or a negative errno.
 *
 * Cleanup is via the err_* label ladder below; once rbd_bus_add_dev()
 * has succeeded, error cleanup is handed off to the sysfs release path
 * (rbd_bus_del_dev() -> rbd_dev_release()).
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	char *options;
	struct rbd_device *rbd_dev = NULL;
	const char *mon_addrs = NULL;
	size_t mon_addrs_size = 0;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;
	char *snap_name;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* count bytes is enough: options is a sub-token of buf */
	options = kmalloc(count, GFP_KERNEL);
	if (!options)
		goto err_nomem;
	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		goto err_nomem;

	/* static rbd_device initialization */
	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);
	init_rwsem(&rbd_dev->header_rwsem);

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* parse add command */
	snap_name = rbd_add_parse_args(rbd_dev, buf,
				&mon_addrs, &mon_addrs_size, options, count);
	if (IS_ERR(snap_name)) {
		rc = PTR_ERR(snap_name);
		goto err_put_id;
	}

	rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
	if (rc < 0)
		goto err_put_id;
	/*
	 * NOTE(review): 'options' does not appear to be freed on the
	 * success path below -- verify ownership against rbd_get_client().
	 */

	/* pick the pool */
	osdc = &rbd_dev->rbd_client->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
	if (rc < 0)
		goto err_out_client;
	rbd_dev->pool_id = rc;

	/* Create the name of the header object */

	rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
						+ sizeof (RBD_SUFFIX),
					GFP_KERNEL);
	if (!rbd_dev->header_name)
		goto err_out_client;
	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);

	/* register our block device */
	rc = register_blkdev(0, rbd_dev->name);	/* 0 = dynamic major */
	if (rc < 0)
		goto err_out_client;
	rbd_dev->major = rc;

	rc = rbd_bus_add_dev(rbd_dev);
	if (rc)
		goto err_out_blkdev;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */

	/* contact OSD, request size info about the object being mapped */
	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (rc)
		goto err_out_bus;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = rbd_dev_snap_devs_update(rbd_dev);
	if (rc)
		goto err_out_bus;

	/*
	 * NOTE(review): snap_name ownership presumably passes to
	 * rbd_header_set_snap() (freed later via mapping.snap_name);
	 * confirm it is not leaked on the err_out_bus paths above.
	 */
	down_write(&rbd_dev->header_rwsem);
	rc = rbd_header_set_snap(rbd_dev, snap_name);
	up_write(&rbd_dev->header_rwsem);
	if (rc)
		goto err_out_bus;

	/* Set up the blkdev mapping. */

	rc = rbd_init_disk(rbd_dev);
	if (rc)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
	add_disk(rbd_dev->disk);
	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	rc = rbd_init_watch_dev(rbd_dev);
	if (rc)
		goto err_out_bus;

	return count;

err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);
	kfree(options);
	return rc;

err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_client:
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev);
err_put_id:
	/* pool_name is only set once parsing succeeded */
	if (rbd_dev->pool_name) {
		kfree(rbd_dev->mapping.snap_name);
		kfree(rbd_dev->image_name);
		kfree(rbd_dev->pool_name);
	}
	rbd_dev_id_put(rbd_dev);
err_nomem:
	kfree(rbd_dev);
	kfree(options);

	dout("Error adding device %s\n", buf);
	module_put(THIS_MODULE);

	return (ssize_t) rc;
}
2645
Alex Elderde71a292012-07-03 16:01:19 -05002646static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002647{
2648 struct list_head *tmp;
2649 struct rbd_device *rbd_dev;
2650
Alex Eldere124a822012-01-29 13:57:44 -06002651 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002652 list_for_each(tmp, &rbd_dev_list) {
2653 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05002654 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06002655 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002656 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06002657 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002658 }
Alex Eldere124a822012-01-29 13:57:44 -06002659 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002660 return NULL;
2661}
2662
/*
 * Device release callback: final teardown of an rbd device, invoked
 * by the driver core once the last reference to the struct device is
 * dropped (initiated via rbd_bus_del_dev()).  The teardown order
 * below is significant -- watches are torn down before the client,
 * and the disk before the allocated name/header state.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop the lingering watch request on the header object, if any. */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	/* Drop our reference on the shared ceph client. */
	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->mapping.snap_name);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_dev_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
2696
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002697static ssize_t rbd_remove(struct bus_type *bus,
2698 const char *buf,
2699 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002700{
2701 struct rbd_device *rbd_dev = NULL;
2702 int target_id, rc;
2703 unsigned long ul;
2704 int ret = count;
2705
2706 rc = strict_strtoul(buf, 10, &ul);
2707 if (rc)
2708 return rc;
2709
2710 /* convert to int; abort if we lost anything in the conversion */
2711 target_id = (int) ul;
2712 if (target_id != ul)
2713 return -EINVAL;
2714
2715 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2716
2717 rbd_dev = __rbd_get_dev(target_id);
2718 if (!rbd_dev) {
2719 ret = -ENOENT;
2720 goto done;
2721 }
2722
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002723 __rbd_remove_all_snaps(rbd_dev);
2724 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002725
2726done:
2727 mutex_unlock(&ctl_mutex);
Alex Elderaafb230e2012-09-06 16:00:54 -05002728
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002729 return ret;
2730}
2731
/*
 * Handler for writes to a device's "snap_create" sysfs attribute:
 * creates a snapshot with the given name, refreshes the in-core
 * header, and notifies watchers.  Returns count on success or a
 * negative errno.
 */
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int ret;
	char *name = kmalloc(count + 1, GFP_KERNEL);
	if (!name)
		return -ENOMEM;

	/*
	 * NOTE(review): snprintf with size 'count' copies at most
	 * count - 1 bytes, dropping the last byte of buf -- presumably
	 * the trailing sysfs newline; confirm input always has one.
	 */
	snprintf(name, count, "%s", buf);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	ret = rbd_header_add_snap(rbd_dev,
				  name, GFP_KERNEL);
	if (ret < 0)
		goto err_unlock;

	/* Re-read the header so the new snapshot shows up in-core. */
	ret = __rbd_refresh_header(rbd_dev, NULL);
	if (ret < 0)
		goto err_unlock;

	/* shouldn't hold ctl_mutex when notifying.. notify might
	   trigger a watch callback that would need to get that mutex */
	mutex_unlock(&ctl_mutex);

	/* make a best effort, don't error if failed */
	rbd_req_sync_notify(rbd_dev);

	ret = count;
	kfree(name);
	return ret;

err_unlock:
	mutex_unlock(&ctl_mutex);
	kfree(name);
	return ret;
}
2772
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002773/*
2774 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002775 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002776 */
2777static int rbd_sysfs_init(void)
2778{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002779 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002780
Alex Elderfed4c142012-02-07 12:03:36 -06002781 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06002782 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002783 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002784
Alex Elderfed4c142012-02-07 12:03:36 -06002785 ret = bus_register(&rbd_bus_type);
2786 if (ret < 0)
2787 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002788
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002789 return ret;
2790}
2791
/* Undo rbd_sysfs_init() in reverse order: bus first, then root device. */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
2797
2798int __init rbd_init(void)
2799{
2800 int rc;
2801
2802 rc = rbd_sysfs_init();
2803 if (rc)
2804 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002805 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002806 return 0;
2807}
2808
/* Module exit point: tear down the sysfs bus and root device. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2813
/* Module entry/exit registration and metadata. */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");