blob: 144694ee03a59fd2e344c03c76eb333991cd6143 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* It might be useful to have this defined elsewhere too */

/* All bits set: the largest value a u64 can hold */
#define	U64_MAX	((u64) (~0ULL))

#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

#define RBD_MAX_SNAP_NAME_LEN	32
#define RBD_MAX_OPT_LEN		1024

/*
 * Snapshot name that selects the image head rather than a snapshot;
 * rbd_header_set_snap() maps it to CEPH_NOSNAP.
 */
#define RBD_SNAP_HEAD_NAME	"-"

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

/* Value rbd_get_client() stores in read_only before options are parsed */
#define RBD_READ_ONLY_DEFAULT		false
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070079
/*
 * block device image metadata (in-memory version)
 *
 * Filled in from the on-disk header by rbd_header_from_disk() and
 * torn down by rbd_header_free().
 */
struct rbd_image_header {
	/* These four fields never change for a given rbd image */
	char *object_prefix;	/* NUL-terminated; prefix of segment names */
	__u8 obj_order;		/* log2 of the object (segment) size */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;  /* snapshot ids and sequence */
	char *snap_names;	/* snapshot names, each NUL-terminated, back to back */
	u64 *snap_sizes;	/* image size for each snapshot, same order as ids */

	u64 obj_version;	/* NOTE(review): not set in the code visible here */
};
98
/*
 * rbd-specific options, filled in by parse_rbd_opts_token() while the
 * option string is being parsed.
 */
struct rbd_options {
	bool	read_only;	/* set by "ro", cleared by "read_write"/"rw" */
};
102
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;	/* underlying ceph client */
	struct kref		kref;		/* released via rbd_client_release() */
	struct list_head	node;		/* entry in rbd_client_list */
};
111
/*
 * a request completion status
 *
 * One of these is kept per request in the status[] array of a
 * struct rbd_req_coll.
 */
struct rbd_req_status {
	int done;	/* presumably non-zero once the request completed -- confirm */
	int rc;		/* presumably the request's result code -- confirm */
	u64 bytes;	/* presumably the byte count of the request -- confirm */
};
120
121/*
122 * a collection of requests
123 */
124struct rbd_req_coll {
125 int total;
126 int num_done;
127 struct kref kref;
128 struct rbd_req_status status[0];
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700129};
130
/*
 * a single io request
 */
struct rbd_request {
	struct request		*rq;		/* blk layer request */
	struct bio		*bio;		/* cloned bio */
	struct page		**pages;	/* list of used pages */
	u64			len;
	int			coll_index;	/* presumably this request's slot in coll->status[] -- confirm */
	struct rbd_req_coll	*coll;		/* collection this request belongs to, if any */
};
142
/*
 * A snapshot of an rbd image.  Each one embeds its own struct device
 * and carries a list node (see the "snaps" list in struct rbd_device).
 */
struct rbd_snap {
	struct	device		dev;
	const char		*name;
	u64			size;
	struct list_head	node;
	u64			id;
};
150
/*
 * What a device is currently mapped to: either the image head or one
 * of its snapshots.  Filled in by rbd_header_set_snap().
 */
struct rbd_mapping {
	char                    *snap_name;	/* name given when mapping */
	u64                     snap_id;	/* CEPH_NOSNAP when mapping the head */
	u64                     size;		/* image size, or the snapshot's size */
	bool                    snap_exists;	/* true when a snapshot is mapped */
	bool			read_only;	/* always true for a snapshot mapping */
};
158
/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	struct rbd_options	rbd_opts;	/* filled in by rbd_get_client() */
	struct rbd_client	*rbd_client;	/* set by rbd_get_client(), cleared by rbd_put_client() */

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue lock */

	struct rbd_image_header	header;
	char			*image_name;
	size_t			image_name_len;
	char			*header_name;
	char			*pool_name;
	int			pool_id;

	struct ceph_osd_event   *watch_event;
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	/* head or snapshot this device exposes; see rbd_header_set_snap() */
	struct rbd_mapping	mapping;

	struct list_head	node;	/* presumably the entry in rbd_dev_list -- confirm */

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related; reference counted through rbd_get_dev()/rbd_put_dev() */
	struct device		dev;
};
198
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);	/* presumably guards rbd_dev_list -- confirm */

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);	/* held while walking or changing rbd_client_list */

/* Forward declarations for functions defined later in this file */
static int rbd_dev_snap_devs_update(struct rbd_device *rbd_dev);
static void rbd_dev_release(struct device *dev);
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count);
static void __rbd_remove_snap_dev(struct rbd_snap *snap);

/* Store handlers for the bus-level "add" and "remove" attributes */
static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
219
/*
 * Bus attributes "add" and "remove".  Both are owner-writable only
 * (S_IWUSR) and have no show method; writes are handled by rbd_add()
 * and rbd_remove() respectively.
 */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

/* The "rbd" bus, carrying the attributes defined above */
static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};
230
/*
 * Release method for rbd_root_dev.  That device is a statically
 * defined object, so there is nothing to free here.
 */
static void rbd_root_dev_release(struct device *dev)
{
}
234
/* Statically defined root device, named "rbd", with a no-op release method */
static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};
239
/*
 * rbd_assert(expr) - check an invariant when RBD_DEBUG is defined.
 *
 * If expr evaluates false, the failing function, line number and
 * expression text are logged and BUG() is called.  Without RBD_DEBUG
 * the macro expands to a no-op and expr is not evaluated, so expr
 * must not have side effects.
 *
 * The debug variant is wrapped in do { } while (0) so that it acts as
 * a single statement: it can be the body of an unbraced if/else
 * without capturing a following "else", and it requires the usual
 * terminating semicolon.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		do {							\
			if (unlikely(!(expr))) {			\
				printk(KERN_ERR "\nAssertion failure in %s() " \
							"at line %d:\n\n" \
						"\trbd_assert(%s);\n\n", \
						__func__, __LINE__, #expr); \
				BUG();					\
			}						\
		} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800252
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800253static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
254{
255 return get_device(&rbd_dev->dev);
256}
257
258static void rbd_put_dev(struct rbd_device *rbd_dev)
259{
260 put_device(&rbd_dev->dev);
261}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700262
Alex Elder1fe5e992012-07-25 09:32:41 -0500263static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700264
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700265static int rbd_open(struct block_device *bdev, fmode_t mode)
266{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600267 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700268
Alex Elderf84344f2012-08-31 17:29:51 -0500269 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700270 return -EROFS;
271
Alex Elder340c7a22012-08-10 13:12:07 -0700272 rbd_get_dev(rbd_dev);
Alex Elderf84344f2012-08-31 17:29:51 -0500273 set_device_ro(bdev, rbd_dev->mapping.read_only);
Alex Elder340c7a22012-08-10 13:12:07 -0700274
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700275 return 0;
276}
277
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800278static int rbd_release(struct gendisk *disk, fmode_t mode)
279{
280 struct rbd_device *rbd_dev = disk->private_data;
281
282 rbd_put_dev(rbd_dev);
283
284 return 0;
285}
286
/* Block device operations: only open and release are provided */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
292
293/*
294 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500295 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700296 */
Alex Elderf8c38922012-08-10 13:12:07 -0700297static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700298{
299 struct rbd_client *rbdc;
300 int ret = -ENOMEM;
301
302 dout("rbd_client_create\n");
303 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
304 if (!rbdc)
305 goto out_opt;
306
307 kref_init(&rbdc->kref);
308 INIT_LIST_HEAD(&rbdc->node);
309
Alex Elderbc534d82012-01-29 13:57:44 -0600310 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
311
Alex Elder43ae4702012-07-03 16:01:18 -0500312 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700313 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600314 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500315 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700316
317 ret = ceph_open_session(rbdc->client);
318 if (ret < 0)
319 goto out_err;
320
Alex Elder432b8582012-01-29 13:57:44 -0600321 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700322 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600323 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700324
Alex Elderbc534d82012-01-29 13:57:44 -0600325 mutex_unlock(&ctl_mutex);
326
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700327 dout("rbd_client_create created %p\n", rbdc);
328 return rbdc;
329
330out_err:
331 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600332out_mutex:
333 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700334 kfree(rbdc);
335out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500336 if (ceph_opts)
337 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400338 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700339}
340
341/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700342 * Find a ceph client with specific addr and configuration. If
343 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700344 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700345static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700346{
347 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700348 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700349
Alex Elder43ae4702012-07-03 16:01:18 -0500350 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700351 return NULL;
352
Alex Elder1f7ba332012-08-10 13:12:07 -0700353 spin_lock(&rbd_client_list_lock);
354 list_for_each_entry(client_node, &rbd_client_list, node) {
355 if (!ceph_compare_options(ceph_opts, client_node->client)) {
356 kref_get(&client_node->kref);
357 found = true;
358 break;
359 }
360 }
361 spin_unlock(&rbd_client_list_lock);
362
363 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700364}
365
/*
 * mount options
 *
 * The Opt_last_* values are markers, not options: parse_rbd_opts_token()
 * classifies a token as int, string or Boolean by comparing it against
 * them, so each new option must be placed in the matching group.
 * There are currently no int or string options.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};
379
Alex Elder43ae4702012-07-03 16:01:18 -0500380static match_table_t rbd_opts_tokens = {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700381 /* int args above */
382 /* string args above */
Alex Elderf84344f2012-08-31 17:29:51 -0500383 {Opt_read_only, "mapping.read_only"},
Alex Eldercc0538b2012-08-10 13:12:07 -0700384 {Opt_read_only, "ro"}, /* Alternate spelling */
385 {Opt_read_write, "read_write"},
386 {Opt_read_write, "rw"}, /* Alternate spelling */
387 /* Boolean args above */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700388 {-1, NULL}
389};
390
391static int parse_rbd_opts_token(char *c, void *private)
392{
Alex Elder43ae4702012-07-03 16:01:18 -0500393 struct rbd_options *rbd_opts = private;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700394 substring_t argstr[MAX_OPT_ARGS];
395 int token, intval, ret;
396
Alex Elder43ae4702012-07-03 16:01:18 -0500397 token = match_token(c, rbd_opts_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700398 if (token < 0)
399 return -EINVAL;
400
401 if (token < Opt_last_int) {
402 ret = match_int(&argstr[0], &intval);
403 if (ret < 0) {
404 pr_err("bad mount option arg (not int) "
405 "at '%s'\n", c);
406 return ret;
407 }
408 dout("got int token %d val %d\n", token, intval);
409 } else if (token > Opt_last_int && token < Opt_last_string) {
410 dout("got string token %d val %s\n", token,
411 argstr[0].from);
Alex Eldercc0538b2012-08-10 13:12:07 -0700412 } else if (token > Opt_last_string && token < Opt_last_bool) {
413 dout("got Boolean token %d\n", token);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700414 } else {
415 dout("got token %d\n", token);
416 }
417
418 switch (token) {
Alex Eldercc0538b2012-08-10 13:12:07 -0700419 case Opt_read_only:
420 rbd_opts->read_only = true;
421 break;
422 case Opt_read_write:
423 rbd_opts->read_only = false;
424 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700425 default:
Alex Elderaafb230e2012-09-06 16:00:54 -0500426 rbd_assert(false);
427 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700428 }
429 return 0;
430}
431
432/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700433 * Get a ceph client with specific addr and configuration, if one does
434 * not exist create it.
435 */
Alex Elderf8c38922012-08-10 13:12:07 -0700436static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
437 size_t mon_addr_len, char *options)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700438{
Alex Elderf8c38922012-08-10 13:12:07 -0700439 struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
Alex Elder43ae4702012-07-03 16:01:18 -0500440 struct ceph_options *ceph_opts;
Alex Elderf8c38922012-08-10 13:12:07 -0700441 struct rbd_client *rbdc;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700442
Alex Eldercc0538b2012-08-10 13:12:07 -0700443 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700444
Alex Elder43ae4702012-07-03 16:01:18 -0500445 ceph_opts = ceph_parse_options(options, mon_addr,
446 mon_addr + mon_addr_len,
447 parse_rbd_opts_token, rbd_opts);
Alex Elderf8c38922012-08-10 13:12:07 -0700448 if (IS_ERR(ceph_opts))
449 return PTR_ERR(ceph_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700450
Alex Elder1f7ba332012-08-10 13:12:07 -0700451 rbdc = rbd_client_find(ceph_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700452 if (rbdc) {
Alex Eldere6994d32012-01-29 13:57:44 -0600453 /* using an existing client */
Alex Elder43ae4702012-07-03 16:01:18 -0500454 ceph_destroy_options(ceph_opts);
Alex Elderf8c38922012-08-10 13:12:07 -0700455 } else {
456 rbdc = rbd_client_create(ceph_opts);
457 if (IS_ERR(rbdc))
458 return PTR_ERR(rbdc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700459 }
Alex Elderf8c38922012-08-10 13:12:07 -0700460 rbd_dev->rbd_client = rbdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700461
Alex Elderf8c38922012-08-10 13:12:07 -0700462 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700463}
464
465/*
466 * Destroy ceph client
Alex Elderd23a4b32012-01-29 13:57:43 -0600467 *
Alex Elder432b8582012-01-29 13:57:44 -0600468 * Caller must hold rbd_client_list_lock.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700469 */
470static void rbd_client_release(struct kref *kref)
471{
472 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
473
474 dout("rbd_release_client %p\n", rbdc);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500475 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700476 list_del(&rbdc->node);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500477 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700478
479 ceph_destroy_client(rbdc->client);
480 kfree(rbdc);
481}
482
483/*
484 * Drop reference to ceph client node. If it's not referenced anymore, release
485 * it.
486 */
487static void rbd_put_client(struct rbd_device *rbd_dev)
488{
489 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
490 rbd_dev->rbd_client = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700491}
492
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700493/*
494 * Destroy requests collection
495 */
496static void rbd_coll_release(struct kref *kref)
497{
498 struct rbd_req_coll *coll =
499 container_of(kref, struct rbd_req_coll, kref);
500
501 dout("rbd_coll_release %p\n", coll);
502 kfree(coll);
503}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700504
Alex Elder8e94af82012-07-25 09:32:40 -0500505static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
506{
Alex Elder103a1502012-08-02 11:29:45 -0500507 size_t size;
508 u32 snap_count;
509
510 /* The header has to start with the magic rbd header text */
511 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
512 return false;
513
514 /*
515 * The size of a snapshot header has to fit in a size_t, and
516 * that limits the number of snapshots.
517 */
518 snap_count = le32_to_cpu(ondisk->snap_count);
519 size = SIZE_MAX - sizeof (struct ceph_snap_context);
520 if (snap_count > size / sizeof (__le64))
521 return false;
522
523 /*
524 * Not only that, but the size of the entire the snapshot
525 * header must also be representable in a size_t.
526 */
527 size -= snap_count * sizeof (__le64);
528 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
529 return false;
530
531 return true;
Alex Elder8e94af82012-07-25 09:32:40 -0500532}
533
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700534/*
535 * Create a new header structure, translate header format from the on-disk
536 * header.
537 */
538static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500539 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700540{
Alex Elderccece232012-07-10 20:30:10 -0500541 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500542 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500543 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500544 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700545
Alex Elder6a523252012-07-19 17:12:59 -0500546 memset(header, 0, sizeof (*header));
547
Alex Elder103a1502012-08-02 11:29:45 -0500548 snap_count = le32_to_cpu(ondisk->snap_count);
549
Alex Elder58c17b02012-08-23 23:22:06 -0500550 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
551 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500552 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700553 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500554 memcpy(header->object_prefix, ondisk->object_prefix, len);
555 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600556
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700557 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500558 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
559
Alex Elder621901d2012-08-23 23:22:06 -0500560 /* Save a copy of the snapshot names */
561
Alex Elderf785cc12012-08-23 23:22:06 -0500562 if (snap_names_len > (u64) SIZE_MAX)
563 return -EIO;
564 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700565 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500566 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500567 /*
568 * Note that rbd_dev_v1_header_read() guarantees
569 * the ondisk buffer we're working with has
570 * snap_names_len bytes beyond the end of the
571 * snapshot id array, this memcpy() is safe.
572 */
573 memcpy(header->snap_names, &ondisk->snaps[snap_count],
574 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500575
Alex Elder621901d2012-08-23 23:22:06 -0500576 /* Record each snapshot's size */
577
Alex Elderd2bb24e2012-07-26 23:37:14 -0500578 size = snap_count * sizeof (*header->snap_sizes);
579 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700580 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500581 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500582 for (i = 0; i < snap_count; i++)
583 header->snap_sizes[i] =
584 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700585 } else {
Alex Elderccece232012-07-10 20:30:10 -0500586 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700587 header->snap_names = NULL;
588 header->snap_sizes = NULL;
589 }
Alex Elder849b4262012-07-09 21:04:24 -0500590
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700591 header->obj_order = ondisk->options.order;
592 header->crypt_type = ondisk->options.crypt_type;
593 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500594
Alex Elder621901d2012-08-23 23:22:06 -0500595 /* Allocate and fill in the snapshot context */
596
Alex Elderf84344f2012-08-31 17:29:51 -0500597 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500598 size = sizeof (struct ceph_snap_context);
599 size += snap_count * sizeof (header->snapc->snaps[0]);
600 header->snapc = kzalloc(size, GFP_KERNEL);
601 if (!header->snapc)
602 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700603
604 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500605 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700606 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500607 for (i = 0; i < snap_count; i++)
608 header->snapc->snaps[i] =
609 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700610
611 return 0;
612
Alex Elder6a523252012-07-19 17:12:59 -0500613out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500614 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500615 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700616 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500617 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500618 kfree(header->object_prefix);
619 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500620
Alex Elder00f1f362012-02-07 12:03:36 -0600621 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700622}
623
Alex Elder8836b992012-08-30 14:42:15 -0500624static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700625{
626 int i;
Alex Elder8836b992012-08-30 14:42:15 -0500627 struct rbd_image_header *header = &rbd_dev->header;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700628 char *p = header->snap_names;
629
Alex Elderc9aadfe2012-08-30 14:42:15 -0500630 rbd_assert(header->snapc != NULL);
631 for (i = 0; i < header->snapc->num_snaps; i++) {
Alex Elder00f1f362012-02-07 12:03:36 -0600632 if (!strcmp(snap_name, p)) {
633
634 /* Found it. Pass back its id and/or size */
635
Alex Elder8836b992012-08-30 14:42:15 -0500636 rbd_dev->mapping.snap_id = header->snapc->snaps[i];
637 rbd_dev->mapping.size = header->snap_sizes[i];
638
Alex Elder00f1f362012-02-07 12:03:36 -0600639 return i;
640 }
641 p += strlen(p) + 1; /* Skip ahead to the next name */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700642 }
Alex Elder00f1f362012-02-07 12:03:36 -0600643 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700644}
645
Alex Elder4e1105a2012-08-31 17:29:52 -0500646static int rbd_header_set_snap(struct rbd_device *rbd_dev, char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700647{
Alex Elder78dc4472012-07-19 08:49:18 -0500648 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700649
Alex Elder4e1105a2012-08-31 17:29:52 -0500650 if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
Josh Durgincc9d7342011-11-21 18:19:13 -0800651 sizeof (RBD_SNAP_HEAD_NAME))) {
Alex Elderf84344f2012-08-31 17:29:51 -0500652 rbd_dev->mapping.snap_id = CEPH_NOSNAP;
Alex Elder99c1f082012-08-30 14:42:15 -0500653 rbd_dev->mapping.size = rbd_dev->header.image_size;
Alex Elderf84344f2012-08-31 17:29:51 -0500654 rbd_dev->mapping.snap_exists = false;
655 rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700656 } else {
Alex Elder8836b992012-08-30 14:42:15 -0500657 ret = snap_by_name(rbd_dev, snap_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700658 if (ret < 0)
659 goto done;
Alex Elderf84344f2012-08-31 17:29:51 -0500660 rbd_dev->mapping.snap_exists = true;
661 rbd_dev->mapping.read_only = true;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700662 }
Alex Elder4e1105a2012-08-31 17:29:52 -0500663 rbd_dev->mapping.snap_name = snap_name;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700664
665 ret = 0;
666done:
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700667 return ret;
668}
669
670static void rbd_header_free(struct rbd_image_header *header)
671{
Alex Elder849b4262012-07-09 21:04:24 -0500672 kfree(header->object_prefix);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500673 header->object_prefix = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700674 kfree(header->snap_sizes);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500675 header->snap_sizes = NULL;
Alex Elder849b4262012-07-09 21:04:24 -0500676 kfree(header->snap_names);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500677 header->snap_names = NULL;
Josh Durgind1d25642011-12-05 14:03:05 -0800678 ceph_put_snap_context(header->snapc);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500679 header->snapc = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700680}
681
Alex Elder65ccfe22012-08-09 10:33:26 -0700682static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700683{
Alex Elder65ccfe22012-08-09 10:33:26 -0700684 char *name;
685 u64 segment;
686 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700687
Alex Elder65ccfe22012-08-09 10:33:26 -0700688 name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
689 if (!name)
690 return NULL;
691 segment = offset >> rbd_dev->header.obj_order;
692 ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
693 rbd_dev->header.object_prefix, segment);
694 if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
695 pr_err("error formatting segment name for #%llu (%d)\n",
696 segment, ret);
697 kfree(name);
698 name = NULL;
699 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700700
Alex Elder65ccfe22012-08-09 10:33:26 -0700701 return name;
702}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700703
Alex Elder65ccfe22012-08-09 10:33:26 -0700704static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
705{
706 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700707
Alex Elder65ccfe22012-08-09 10:33:26 -0700708 return offset & (segment_size - 1);
709}
710
711static u64 rbd_segment_length(struct rbd_device *rbd_dev,
712 u64 offset, u64 length)
713{
714 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
715
716 offset &= segment_size - 1;
717
Alex Elderaafb230e2012-09-06 16:00:54 -0500718 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700719 if (offset + length > segment_size)
720 length = segment_size - offset;
721
722 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700723}
724
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700725static int rbd_get_num_segments(struct rbd_image_header *header,
726 u64 ofs, u64 len)
727{
Alex Elderdf111be2012-08-09 10:33:26 -0700728 u64 start_seg;
729 u64 end_seg;
730
731 if (!len)
732 return 0;
733 if (len - 1 > U64_MAX - ofs)
734 return -ERANGE;
735
736 start_seg = ofs >> header->obj_order;
737 end_seg = (ofs + len - 1) >> header->obj_order;
738
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700739 return end_seg - start_seg + 1;
740}
741
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700742/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700743 * returns the size of an object in the image
744 */
745static u64 rbd_obj_bytes(struct rbd_image_header *header)
746{
747 return 1 << header->obj_order;
748}
749
750/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700751 * bio helpers
752 */
753
754static void bio_chain_put(struct bio *chain)
755{
756 struct bio *tmp;
757
758 while (chain) {
759 tmp = chain;
760 chain = chain->bi_next;
761 bio_put(tmp);
762 }
763}
764
765/*
766 * zeros a bio chain, starting at specific offset
767 */
768static void zero_bio_chain(struct bio *chain, int start_ofs)
769{
770 struct bio_vec *bv;
771 unsigned long flags;
772 void *buf;
773 int i;
774 int pos = 0;
775
776 while (chain) {
777 bio_for_each_segment(bv, chain, i) {
778 if (pos + bv->bv_len > start_ofs) {
779 int remainder = max(start_ofs - pos, 0);
780 buf = bvec_kmap_irq(bv, &flags);
781 memset(buf + remainder, 0,
782 bv->bv_len - remainder);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +0200783 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700784 }
785 pos += bv->bv_len;
786 }
787
788 chain = chain->bi_next;
789 }
790}
791
792/*
793 * bio_chain_clone - clone a chain of bios up to a certain length.
794 * might return a bio_pair that will need to be released.
795 */
796static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
797 struct bio_pair **bp,
798 int len, gfp_t gfpmask)
799{
Alex Elder542582f2012-08-09 10:33:25 -0700800 struct bio *old_chain = *old;
801 struct bio *new_chain = NULL;
802 struct bio *tail;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700803 int total = 0;
804
805 if (*bp) {
806 bio_pair_release(*bp);
807 *bp = NULL;
808 }
809
810 while (old_chain && (total < len)) {
Alex Elder542582f2012-08-09 10:33:25 -0700811 struct bio *tmp;
812
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700813 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
814 if (!tmp)
815 goto err_out;
Alex Elder542582f2012-08-09 10:33:25 -0700816 gfpmask &= ~__GFP_WAIT; /* can't wait after the first */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700817
818 if (total + old_chain->bi_size > len) {
819 struct bio_pair *bp;
820
821 /*
822 * this split can only happen with a single paged bio,
823 * split_bio will BUG_ON if this is not the case
824 */
825 dout("bio_chain_clone split! total=%d remaining=%d"
Alex Elderbd919d42012-07-13 20:35:11 -0500826 "bi_size=%u\n",
827 total, len - total, old_chain->bi_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700828
829 /* split the bio. We'll release it either in the next
830 call, or it will have to be released outside */
Alex Elder593a9e72012-02-07 12:03:37 -0600831 bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700832 if (!bp)
833 goto err_out;
834
835 __bio_clone(tmp, &bp->bio1);
836
837 *next = &bp->bio2;
838 } else {
839 __bio_clone(tmp, old_chain);
840 *next = old_chain->bi_next;
841 }
842
843 tmp->bi_bdev = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700844 tmp->bi_next = NULL;
Alex Elder542582f2012-08-09 10:33:25 -0700845 if (new_chain)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700846 tail->bi_next = tmp;
Alex Elder542582f2012-08-09 10:33:25 -0700847 else
848 new_chain = tmp;
849 tail = tmp;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700850 old_chain = old_chain->bi_next;
851
852 total += tmp->bi_size;
853 }
854
Alex Elderaafb230e2012-09-06 16:00:54 -0500855 rbd_assert(total == len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700856
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700857 *old = old_chain;
858
859 return new_chain;
860
861err_out:
862 dout("bio_chain_clone with err\n");
863 bio_chain_put(new_chain);
864 return NULL;
865}
866
867/*
868 * helpers for osd request op vectors.
869 */
Alex Elder57cfc102012-06-26 12:57:03 -0700870static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
871 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700872{
Alex Elder57cfc102012-06-26 12:57:03 -0700873 struct ceph_osd_req_op *ops;
874
875 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
876 if (!ops)
877 return NULL;
878
879 ops[0].op = opcode;
880
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700881 /*
882 * op extent offset and length will be set later on
883 * in calc_raw_layout()
884 */
Alex Elder57cfc102012-06-26 12:57:03 -0700885 ops[0].payload_len = payload_len;
886
887 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700888}
889
/* Free an op vector allocated by rbd_create_rw_ops() */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
894
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700895static void rbd_coll_end_req_index(struct request *rq,
896 struct rbd_req_coll *coll,
897 int index,
898 int ret, u64 len)
899{
900 struct request_queue *q;
901 int min, max, i;
902
Alex Elderbd919d42012-07-13 20:35:11 -0500903 dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
904 coll, index, ret, (unsigned long long) len);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700905
906 if (!rq)
907 return;
908
909 if (!coll) {
910 blk_end_request(rq, ret, len);
911 return;
912 }
913
914 q = rq->q;
915
916 spin_lock_irq(q->queue_lock);
917 coll->status[index].done = 1;
918 coll->status[index].rc = ret;
919 coll->status[index].bytes = len;
920 max = min = coll->num_done;
921 while (max < coll->total && coll->status[max].done)
922 max++;
923
924 for (i = min; i<max; i++) {
925 __blk_end_request(rq, coll->status[i].rc,
926 coll->status[i].bytes);
927 coll->num_done++;
928 kref_put(&coll->kref, rbd_coll_release);
929 }
930 spin_unlock_irq(q->queue_lock);
931}
932
933static void rbd_coll_end_req(struct rbd_request *req,
934 int ret, u64 len)
935{
936 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
937}
938
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700939/*
940 * Send ceph osd request
941 */
942static int rbd_do_request(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -0500943 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700944 struct ceph_snap_context *snapc,
945 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -0500946 const char *object_name, u64 ofs, u64 len,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700947 struct bio *bio,
948 struct page **pages,
949 int num_pages,
950 int flags,
951 struct ceph_osd_req_op *ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700952 struct rbd_req_coll *coll,
953 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700954 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700955 struct ceph_msg *msg),
956 struct ceph_osd_request **linger_req,
957 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700958{
959 struct ceph_osd_request *req;
960 struct ceph_file_layout *layout;
961 int ret;
962 u64 bno;
963 struct timespec mtime = CURRENT_TIME;
964 struct rbd_request *req_data;
965 struct ceph_osd_request_head *reqhead;
Alex Elder1dbb4392012-01-24 10:08:37 -0600966 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700967
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700968 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700969 if (!req_data) {
970 if (coll)
971 rbd_coll_end_req_index(rq, coll, coll_index,
972 -ENOMEM, len);
973 return -ENOMEM;
974 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700975
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700976 if (coll) {
977 req_data->coll = coll;
978 req_data->coll_index = coll_index;
979 }
980
Alex Elderbd919d42012-07-13 20:35:11 -0500981 dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
982 (unsigned long long) ofs, (unsigned long long) len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700983
Alex Elder0ce1a792012-07-03 16:01:18 -0500984 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder1dbb4392012-01-24 10:08:37 -0600985 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
986 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -0700987 if (!req) {
Sage Weil4ad12622011-05-03 09:23:36 -0700988 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700989 goto done_pages;
990 }
991
992 req->r_callback = rbd_cb;
993
994 req_data->rq = rq;
995 req_data->bio = bio;
996 req_data->pages = pages;
997 req_data->len = len;
998
999 req->r_priv = req_data;
1000
1001 reqhead = req->r_request->front.iov_base;
1002 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
1003
Alex Elderaded07e2012-07-03 16:01:18 -05001004 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001005 req->r_oid_len = strlen(req->r_oid);
1006
1007 layout = &req->r_file_layout;
1008 memset(layout, 0, sizeof(*layout));
1009 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1010 layout->fl_stripe_count = cpu_to_le32(1);
1011 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
Alex Elder0ce1a792012-07-03 16:01:18 -05001012 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
Alex Elder1dbb4392012-01-24 10:08:37 -06001013 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
1014 req, ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001015
1016 ceph_osdc_build_request(req, ofs, &len,
1017 ops,
1018 snapc,
1019 &mtime,
1020 req->r_oid, req->r_oid_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001021
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001022 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -06001023 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001024 *linger_req = req;
1025 }
1026
Alex Elder1dbb4392012-01-24 10:08:37 -06001027 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001028 if (ret < 0)
1029 goto done_err;
1030
1031 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -06001032 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001033 if (ver)
1034 *ver = le64_to_cpu(req->r_reassert_version.version);
Alex Elderbd919d42012-07-13 20:35:11 -05001035 dout("reassert_ver=%llu\n",
1036 (unsigned long long)
1037 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001038 ceph_osdc_put_request(req);
1039 }
1040 return ret;
1041
1042done_err:
1043 bio_chain_put(req_data->bio);
1044 ceph_osdc_put_request(req);
1045done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001046 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001047 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001048 return ret;
1049}
1050
1051/*
1052 * Ceph osd op callback
1053 */
1054static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1055{
1056 struct rbd_request *req_data = req->r_priv;
1057 struct ceph_osd_reply_head *replyhead;
1058 struct ceph_osd_op *op;
1059 __s32 rc;
1060 u64 bytes;
1061 int read_op;
1062
1063 /* parse reply */
1064 replyhead = msg->front.iov_base;
1065 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1066 op = (void *)(replyhead + 1);
1067 rc = le32_to_cpu(replyhead->result);
1068 bytes = le64_to_cpu(op->extent.length);
Dan Carpenter895cfcc2012-06-06 09:15:33 -05001069 read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001070
Alex Elderbd919d42012-07-13 20:35:11 -05001071 dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1072 (unsigned long long) bytes, read_op, (int) rc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001073
1074 if (rc == -ENOENT && read_op) {
1075 zero_bio_chain(req_data->bio, 0);
1076 rc = 0;
1077 } else if (rc == 0 && read_op && bytes < req_data->len) {
1078 zero_bio_chain(req_data->bio, bytes);
1079 bytes = req_data->len;
1080 }
1081
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001082 rbd_coll_end_req(req_data, rc, bytes);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001083
1084 if (req_data->bio)
1085 bio_chain_put(req_data->bio);
1086
1087 ceph_osdc_put_request(req);
1088 kfree(req_data);
1089}
1090
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001091static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1092{
1093 ceph_osdc_put_request(req);
1094}
1095
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001096/*
1097 * Do a synchronous ceph osd operation
1098 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001099static int rbd_req_sync_op(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001100 struct ceph_snap_context *snapc,
1101 u64 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001102 int flags,
Alex Elder913d2fd2012-06-26 12:57:03 -07001103 struct ceph_osd_req_op *ops,
Alex Elderaded07e2012-07-03 16:01:18 -05001104 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001105 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001106 char *buf,
1107 struct ceph_osd_request **linger_req,
1108 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001109{
1110 int ret;
1111 struct page **pages;
1112 int num_pages;
Alex Elder913d2fd2012-06-26 12:57:03 -07001113
Alex Elderaafb230e2012-09-06 16:00:54 -05001114 rbd_assert(ops != NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001115
1116 num_pages = calc_pages_for(ofs , len);
1117 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
Dan Carpenterb8d06382010-10-11 21:14:23 +02001118 if (IS_ERR(pages))
1119 return PTR_ERR(pages);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001120
Alex Elder0ce1a792012-07-03 16:01:18 -05001121 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001122 object_name, ofs, len, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001123 pages, num_pages,
1124 flags,
1125 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001126 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001127 NULL,
1128 linger_req, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001129 if (ret < 0)
Alex Elder913d2fd2012-06-26 12:57:03 -07001130 goto done;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001131
1132 if ((flags & CEPH_OSD_FLAG_READ) && buf)
1133 ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
1134
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001135done:
1136 ceph_release_page_vector(pages, num_pages);
1137 return ret;
1138}
1139
1140/*
1141 * Do an asynchronous ceph osd operation
1142 */
1143static int rbd_do_op(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -05001144 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001145 struct ceph_snap_context *snapc,
1146 u64 snapid,
Alex Elderd1f57ea2012-06-26 12:57:03 -07001147 int opcode, int flags,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001148 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001149 struct bio *bio,
1150 struct rbd_req_coll *coll,
1151 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001152{
1153 char *seg_name;
1154 u64 seg_ofs;
1155 u64 seg_len;
1156 int ret;
1157 struct ceph_osd_req_op *ops;
1158 u32 payload_len;
1159
Alex Elder65ccfe22012-08-09 10:33:26 -07001160 seg_name = rbd_segment_name(rbd_dev, ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001161 if (!seg_name)
1162 return -ENOMEM;
Alex Elder65ccfe22012-08-09 10:33:26 -07001163 seg_len = rbd_segment_length(rbd_dev, ofs, len);
1164 seg_ofs = rbd_segment_offset(rbd_dev, ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001165
1166 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1167
Alex Elder57cfc102012-06-26 12:57:03 -07001168 ret = -ENOMEM;
1169 ops = rbd_create_rw_ops(1, opcode, payload_len);
1170 if (!ops)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001171 goto done;
1172
1173 /* we've taken care of segment sizes earlier when we
1174 cloned the bios. We should never have a segment
1175 truncated at this point */
Alex Elderaafb230e2012-09-06 16:00:54 -05001176 rbd_assert(seg_len == len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001177
1178 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1179 seg_name, seg_ofs, seg_len,
1180 bio,
1181 NULL, 0,
1182 flags,
1183 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001184 coll, coll_index,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001185 rbd_req_cb, 0, NULL);
Sage Weil11f77002011-05-12 16:13:54 -07001186
1187 rbd_destroy_ops(ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001188done:
1189 kfree(seg_name);
1190 return ret;
1191}
1192
1193/*
1194 * Request async osd write
1195 */
1196static int rbd_req_write(struct request *rq,
1197 struct rbd_device *rbd_dev,
1198 struct ceph_snap_context *snapc,
1199 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001200 struct bio *bio,
1201 struct rbd_req_coll *coll,
1202 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001203{
1204 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1205 CEPH_OSD_OP_WRITE,
1206 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001207 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001208}
1209
1210/*
1211 * Request async osd read
1212 */
1213static int rbd_req_read(struct request *rq,
1214 struct rbd_device *rbd_dev,
1215 u64 snapid,
1216 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001217 struct bio *bio,
1218 struct rbd_req_coll *coll,
1219 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001220{
1221 return rbd_do_op(rq, rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001222 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001223 CEPH_OSD_OP_READ,
1224 CEPH_OSD_FLAG_READ,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001225 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001226}
1227
1228/*
1229 * Request sync osd read
1230 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001231static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001232 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001233 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001234 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001235 char *buf,
1236 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001237{
Alex Elder913d2fd2012-06-26 12:57:03 -07001238 struct ceph_osd_req_op *ops;
1239 int ret;
1240
1241 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1242 if (!ops)
1243 return -ENOMEM;
1244
1245 ret = rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001246 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001247 CEPH_OSD_FLAG_READ,
Alex Elder913d2fd2012-06-26 12:57:03 -07001248 ops, object_name, ofs, len, buf, NULL, ver);
1249 rbd_destroy_ops(ops);
1250
1251 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001252}
1253
1254/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001255 * Request sync osd watch
1256 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001257static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001258 u64 ver,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001259 u64 notify_id)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001260{
1261 struct ceph_osd_req_op *ops;
Sage Weil11f77002011-05-12 16:13:54 -07001262 int ret;
1263
Alex Elder57cfc102012-06-26 12:57:03 -07001264 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1265 if (!ops)
1266 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001267
Josh Durgina71b8912011-12-05 18:10:44 -08001268 ops[0].watch.ver = cpu_to_le64(ver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001269 ops[0].watch.cookie = notify_id;
1270 ops[0].watch.flag = 0;
1271
Alex Elder0ce1a792012-07-03 16:01:18 -05001272 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001273 rbd_dev->header_name, 0, 0, NULL,
Alex Elderad4f2322012-07-03 16:01:19 -05001274 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001275 CEPH_OSD_FLAG_READ,
1276 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001277 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001278 rbd_simple_req_cb, 0, NULL);
1279
1280 rbd_destroy_ops(ops);
1281 return ret;
1282}
1283
1284static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1285{
Alex Elder0ce1a792012-07-03 16:01:18 -05001286 struct rbd_device *rbd_dev = (struct rbd_device *)data;
Josh Durgina71b8912011-12-05 18:10:44 -08001287 u64 hver;
Sage Weil13143d22011-05-12 16:08:30 -07001288 int rc;
1289
Alex Elder0ce1a792012-07-03 16:01:18 -05001290 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001291 return;
1292
Alex Elderbd919d42012-07-13 20:35:11 -05001293 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1294 rbd_dev->header_name, (unsigned long long) notify_id,
1295 (unsigned int) opcode);
Alex Elder1fe5e992012-07-25 09:32:41 -05001296 rc = rbd_refresh_header(rbd_dev, &hver);
Sage Weil13143d22011-05-12 16:08:30 -07001297 if (rc)
Alex Elderf0f8cef2012-01-29 13:57:44 -06001298 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
Alex Elder0ce1a792012-07-03 16:01:18 -05001299 " update snaps: %d\n", rbd_dev->major, rc);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001300
Alex Elder7f0a24d2012-07-25 09:32:40 -05001301 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001302}
1303
1304/*
1305 * Request sync osd watch
1306 */
Alex Elder0e6f3222012-07-25 09:32:40 -05001307static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001308{
1309 struct ceph_osd_req_op *ops;
Alex Elder0ce1a792012-07-03 16:01:18 -05001310 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder57cfc102012-06-26 12:57:03 -07001311 int ret;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001312
Alex Elder57cfc102012-06-26 12:57:03 -07001313 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1314 if (!ops)
1315 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001316
1317 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
Alex Elder0ce1a792012-07-03 16:01:18 -05001318 (void *)rbd_dev, &rbd_dev->watch_event);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001319 if (ret < 0)
1320 goto fail;
1321
Alex Elder0e6f3222012-07-25 09:32:40 -05001322 ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
Alex Elder0ce1a792012-07-03 16:01:18 -05001323 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001324 ops[0].watch.flag = 1;
1325
Alex Elder0ce1a792012-07-03 16:01:18 -05001326 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001327 CEPH_NOSNAP,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001328 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1329 ops,
Alex Elder0e6f3222012-07-25 09:32:40 -05001330 rbd_dev->header_name,
1331 0, 0, NULL,
Alex Elder0ce1a792012-07-03 16:01:18 -05001332 &rbd_dev->watch_request, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001333
1334 if (ret < 0)
1335 goto fail_event;
1336
1337 rbd_destroy_ops(ops);
1338 return 0;
1339
1340fail_event:
Alex Elder0ce1a792012-07-03 16:01:18 -05001341 ceph_osdc_cancel_event(rbd_dev->watch_event);
1342 rbd_dev->watch_event = NULL;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001343fail:
1344 rbd_destroy_ops(ops);
1345 return ret;
1346}
1347
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001348/*
1349 * Request sync osd unwatch
1350 */
Alex Elder070c6332012-07-25 09:32:41 -05001351static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001352{
1353 struct ceph_osd_req_op *ops;
Alex Elder57cfc102012-06-26 12:57:03 -07001354 int ret;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001355
Alex Elder57cfc102012-06-26 12:57:03 -07001356 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1357 if (!ops)
1358 return -ENOMEM;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001359
1360 ops[0].watch.ver = 0;
Alex Elder0ce1a792012-07-03 16:01:18 -05001361 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001362 ops[0].watch.flag = 0;
1363
Alex Elder0ce1a792012-07-03 16:01:18 -05001364 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001365 CEPH_NOSNAP,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001366 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1367 ops,
Alex Elder070c6332012-07-25 09:32:41 -05001368 rbd_dev->header_name,
1369 0, 0, NULL, NULL, NULL);
1370
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001371
1372 rbd_destroy_ops(ops);
Alex Elder0ce1a792012-07-03 16:01:18 -05001373 ceph_osdc_cancel_event(rbd_dev->watch_event);
1374 rbd_dev->watch_event = NULL;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001375 return ret;
1376}
1377
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001378struct rbd_notify_info {
Alex Elder0ce1a792012-07-03 16:01:18 -05001379 struct rbd_device *rbd_dev;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001380};
1381
1382static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1383{
Alex Elder0ce1a792012-07-03 16:01:18 -05001384 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1385 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001386 return;
1387
Alex Elderbd919d42012-07-13 20:35:11 -05001388 dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n",
1389 rbd_dev->header_name, (unsigned long long) notify_id,
1390 (unsigned int) opcode);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001391}
1392
1393/*
1394 * Request sync osd notify
1395 */
Alex Elder4cb16252012-07-25 09:32:40 -05001396static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001397{
1398 struct ceph_osd_req_op *ops;
Alex Elder0ce1a792012-07-03 16:01:18 -05001399 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001400 struct ceph_osd_event *event;
1401 struct rbd_notify_info info;
1402 int payload_len = sizeof(u32) + sizeof(u32);
1403 int ret;
1404
Alex Elder57cfc102012-06-26 12:57:03 -07001405 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len);
1406 if (!ops)
1407 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001408
Alex Elder0ce1a792012-07-03 16:01:18 -05001409 info.rbd_dev = rbd_dev;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001410
1411 ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
1412 (void *)&info, &event);
1413 if (ret < 0)
1414 goto fail;
1415
1416 ops[0].watch.ver = 1;
1417 ops[0].watch.flag = 1;
1418 ops[0].watch.cookie = event->cookie;
1419 ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
1420 ops[0].watch.timeout = 12;
1421
Alex Elder0ce1a792012-07-03 16:01:18 -05001422 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001423 CEPH_NOSNAP,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001424 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1425 ops,
Alex Elder4cb16252012-07-25 09:32:40 -05001426 rbd_dev->header_name,
1427 0, 0, NULL, NULL, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001428 if (ret < 0)
1429 goto fail_event;
1430
1431 ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
1432 dout("ceph_osdc_wait_event returned %d\n", ret);
1433 rbd_destroy_ops(ops);
1434 return 0;
1435
1436fail_event:
1437 ceph_osdc_cancel_event(event);
1438fail:
1439 rbd_destroy_ops(ops);
1440 return ret;
1441}
1442
1443/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001444 * Request sync osd read
1445 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001446static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001447 const char *object_name,
1448 const char *class_name,
1449 const char *method_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001450 const char *data,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001451 int len,
1452 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001453{
1454 struct ceph_osd_req_op *ops;
Alex Elderaded07e2012-07-03 16:01:18 -05001455 int class_name_len = strlen(class_name);
1456 int method_name_len = strlen(method_name);
Alex Elder57cfc102012-06-26 12:57:03 -07001457 int ret;
1458
1459 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL,
Alex Elderaded07e2012-07-03 16:01:18 -05001460 class_name_len + method_name_len + len);
Alex Elder57cfc102012-06-26 12:57:03 -07001461 if (!ops)
1462 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001463
Alex Elderaded07e2012-07-03 16:01:18 -05001464 ops[0].cls.class_name = class_name;
1465 ops[0].cls.class_len = (__u8) class_name_len;
1466 ops[0].cls.method_name = method_name;
1467 ops[0].cls.method_len = (__u8) method_name_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001468 ops[0].cls.argc = 0;
1469 ops[0].cls.indata = data;
1470 ops[0].cls.indata_len = len;
1471
Alex Elder0ce1a792012-07-03 16:01:18 -05001472 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001473 CEPH_NOSNAP,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001474 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1475 ops,
Alex Elderd1f57ea2012-06-26 12:57:03 -07001476 object_name, 0, 0, NULL, NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001477
1478 rbd_destroy_ops(ops);
1479
1480 dout("cls_exec returned %d\n", ret);
1481 return ret;
1482}
1483
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001484static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1485{
1486 struct rbd_req_coll *coll =
1487 kzalloc(sizeof(struct rbd_req_coll) +
1488 sizeof(struct rbd_req_status) * num_reqs,
1489 GFP_ATOMIC);
1490
1491 if (!coll)
1492 return NULL;
1493 coll->total = num_reqs;
1494 kref_init(&coll->kref);
1495 return coll;
1496}
1497
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001498/*
1499 * block device queue callback
1500 */
1501static void rbd_rq_fn(struct request_queue *q)
1502{
1503 struct rbd_device *rbd_dev = q->queuedata;
1504 struct request *rq;
1505 struct bio_pair *bp = NULL;
1506
Alex Elder00f1f362012-02-07 12:03:36 -06001507 while ((rq = blk_fetch_request(q))) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001508 struct bio *bio;
1509 struct bio *rq_bio, *next_bio = NULL;
1510 bool do_write;
Alex Elderbd919d42012-07-13 20:35:11 -05001511 unsigned int size;
1512 u64 op_size = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001513 u64 ofs;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001514 int num_segs, cur_seg = 0;
1515 struct rbd_req_coll *coll;
Josh Durgind1d25642011-12-05 14:03:05 -08001516 struct ceph_snap_context *snapc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001517
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001518 dout("fetched request\n");
1519
1520 /* filter out block requests we don't understand */
1521 if ((rq->cmd_type != REQ_TYPE_FS)) {
1522 __blk_end_request_all(rq, 0);
Alex Elder00f1f362012-02-07 12:03:36 -06001523 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001524 }
1525
1526 /* deduce our operation (read, write) */
1527 do_write = (rq_data_dir(rq) == WRITE);
1528
1529 size = blk_rq_bytes(rq);
Alex Elder593a9e72012-02-07 12:03:37 -06001530 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001531 rq_bio = rq->bio;
Alex Elderf84344f2012-08-31 17:29:51 -05001532 if (do_write && rbd_dev->mapping.read_only) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001533 __blk_end_request_all(rq, -EROFS);
Alex Elder00f1f362012-02-07 12:03:36 -06001534 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001535 }
1536
1537 spin_unlock_irq(q->queue_lock);
1538
Josh Durgind1d25642011-12-05 14:03:05 -08001539 down_read(&rbd_dev->header_rwsem);
Josh Durgine88a36e2011-11-21 18:14:25 -08001540
Alex Elderf84344f2012-08-31 17:29:51 -05001541 if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
1542 !rbd_dev->mapping.snap_exists) {
Josh Durgine88a36e2011-11-21 18:14:25 -08001543 up_read(&rbd_dev->header_rwsem);
Josh Durgind1d25642011-12-05 14:03:05 -08001544 dout("request for non-existent snapshot");
1545 spin_lock_irq(q->queue_lock);
1546 __blk_end_request_all(rq, -ENXIO);
1547 continue;
Josh Durgine88a36e2011-11-21 18:14:25 -08001548 }
1549
Josh Durgind1d25642011-12-05 14:03:05 -08001550 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1551
1552 up_read(&rbd_dev->header_rwsem);
1553
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001554 dout("%s 0x%x bytes at 0x%llx\n",
1555 do_write ? "write" : "read",
Alex Elderbd919d42012-07-13 20:35:11 -05001556 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001557
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001558 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
Alex Elderdf111be2012-08-09 10:33:26 -07001559 if (num_segs <= 0) {
1560 spin_lock_irq(q->queue_lock);
1561 __blk_end_request_all(rq, num_segs);
1562 ceph_put_snap_context(snapc);
1563 continue;
1564 }
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001565 coll = rbd_alloc_coll(num_segs);
1566 if (!coll) {
1567 spin_lock_irq(q->queue_lock);
1568 __blk_end_request_all(rq, -ENOMEM);
Josh Durgind1d25642011-12-05 14:03:05 -08001569 ceph_put_snap_context(snapc);
Alex Elder00f1f362012-02-07 12:03:36 -06001570 continue;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001571 }
1572
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001573 do {
1574 /* a bio clone to be passed down to OSD req */
Alex Elderbd919d42012-07-13 20:35:11 -05001575 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
Alex Elder65ccfe22012-08-09 10:33:26 -07001576 op_size = rbd_segment_length(rbd_dev, ofs, size);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001577 kref_get(&coll->kref);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001578 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1579 op_size, GFP_ATOMIC);
1580 if (!bio) {
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001581 rbd_coll_end_req_index(rq, coll, cur_seg,
1582 -ENOMEM, op_size);
1583 goto next_seg;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001584 }
1585
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001586
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001587 /* init OSD command: write or read */
1588 if (do_write)
1589 rbd_req_write(rq, rbd_dev,
Josh Durgind1d25642011-12-05 14:03:05 -08001590 snapc,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001591 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001592 op_size, bio,
1593 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001594 else
1595 rbd_req_read(rq, rbd_dev,
Alex Elderf84344f2012-08-31 17:29:51 -05001596 rbd_dev->mapping.snap_id,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001597 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001598 op_size, bio,
1599 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001600
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001601next_seg:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001602 size -= op_size;
1603 ofs += op_size;
1604
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001605 cur_seg++;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001606 rq_bio = next_bio;
1607 } while (size > 0);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001608 kref_put(&coll->kref, rbd_coll_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001609
1610 if (bp)
1611 bio_pair_release(bp);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001612 spin_lock_irq(q->queue_lock);
Josh Durgind1d25642011-12-05 14:03:05 -08001613
1614 ceph_put_snap_context(snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001615 }
1616}
1617
1618/*
1619 * a queue callback. Makes sure that we don't create a bio that spans across
1620 * multiple osd objects. One exception would be with a single page bios,
1621 * which we handle later at bio_chain_clone
1622 */
1623static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1624 struct bio_vec *bvec)
1625{
1626 struct rbd_device *rbd_dev = q->queuedata;
Alex Elder593a9e72012-02-07 12:03:37 -06001627 unsigned int chunk_sectors;
1628 sector_t sector;
1629 unsigned int bio_sectors;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001630 int max;
1631
Alex Elder593a9e72012-02-07 12:03:37 -06001632 chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1633 sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1634 bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1635
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001636 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
Alex Elder593a9e72012-02-07 12:03:37 -06001637 + bio_sectors)) << SECTOR_SHIFT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001638 if (max < 0)
1639 max = 0; /* bio_add cannot handle a negative return */
1640 if (max <= bvec->bv_len && bio_sectors == 0)
1641 return bvec->bv_len;
1642 return max;
1643}
1644
1645static void rbd_free_disk(struct rbd_device *rbd_dev)
1646{
1647 struct gendisk *disk = rbd_dev->disk;
1648
1649 if (!disk)
1650 return;
1651
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001652 if (disk->flags & GENHD_FL_UP)
1653 del_gendisk(disk);
1654 if (disk->queue)
1655 blk_cleanup_queue(disk->queue);
1656 put_disk(disk);
1657}
1658
1659/*
Alex Elder4156d992012-08-02 11:29:46 -05001660 * Read the complete header for the given rbd device.
1661 *
1662 * Returns a pointer to a dynamically-allocated buffer containing
1663 * the complete and validated header. Caller can pass the address
1664 * of a variable that will be filled in with the version of the
1665 * header object at the time it was read.
1666 *
1667 * Returns a pointer-coded errno if a failure occurs.
1668 */
1669static struct rbd_image_header_ondisk *
1670rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
1671{
1672 struct rbd_image_header_ondisk *ondisk = NULL;
1673 u32 snap_count = 0;
1674 u64 names_size = 0;
1675 u32 want_count;
1676 int ret;
1677
1678 /*
1679 * The complete header will include an array of its 64-bit
1680 * snapshot ids, followed by the names of those snapshots as
1681 * a contiguous block of NUL-terminated strings. Note that
1682 * the number of snapshots could change by the time we read
1683 * it in, in which case we re-read it.
1684 */
1685 do {
1686 size_t size;
1687
1688 kfree(ondisk);
1689
1690 size = sizeof (*ondisk);
1691 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1692 size += names_size;
1693 ondisk = kmalloc(size, GFP_KERNEL);
1694 if (!ondisk)
1695 return ERR_PTR(-ENOMEM);
1696
1697 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
1698 rbd_dev->header_name,
1699 0, size,
1700 (char *) ondisk, version);
1701
1702 if (ret < 0)
1703 goto out_err;
1704 if (WARN_ON((size_t) ret < size)) {
1705 ret = -ENXIO;
1706 pr_warning("short header read for image %s"
1707 " (want %zd got %d)\n",
1708 rbd_dev->image_name, size, ret);
1709 goto out_err;
1710 }
1711 if (!rbd_dev_ondisk_valid(ondisk)) {
1712 ret = -ENXIO;
1713 pr_warning("invalid header for image %s\n",
1714 rbd_dev->image_name);
1715 goto out_err;
1716 }
1717
1718 names_size = le64_to_cpu(ondisk->snap_names_len);
1719 want_count = snap_count;
1720 snap_count = le32_to_cpu(ondisk->snap_count);
1721 } while (snap_count != want_count);
1722
1723 return ondisk;
1724
1725out_err:
1726 kfree(ondisk);
1727
1728 return ERR_PTR(ret);
1729}
1730
1731/*
1732 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001733 */
1734static int rbd_read_header(struct rbd_device *rbd_dev,
1735 struct rbd_image_header *header)
1736{
Alex Elder4156d992012-08-02 11:29:46 -05001737 struct rbd_image_header_ondisk *ondisk;
1738 u64 ver = 0;
1739 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001740
Alex Elder4156d992012-08-02 11:29:46 -05001741 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1742 if (IS_ERR(ondisk))
1743 return PTR_ERR(ondisk);
1744 ret = rbd_header_from_disk(header, ondisk);
1745 if (ret >= 0)
1746 header->obj_version = ver;
1747 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001748
Alex Elder4156d992012-08-02 11:29:46 -05001749 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001750}
1751
1752/*
1753 * create a snapshot
1754 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001755static int rbd_header_add_snap(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001756 const char *snap_name,
1757 gfp_t gfp_flags)
1758{
1759 int name_len = strlen(snap_name);
1760 u64 new_snapid;
1761 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001762 void *data, *p, *e;
Alex Elder1dbb4392012-01-24 10:08:37 -06001763 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001764
1765 /* we should create a snapshot only if we're pointing at the head */
Alex Elderf84344f2012-08-31 17:29:51 -05001766 if (rbd_dev->mapping.snap_id != CEPH_NOSNAP)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001767 return -EINVAL;
1768
Alex Elder0ce1a792012-07-03 16:01:18 -05001769 monc = &rbd_dev->rbd_client->client->monc;
1770 ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
Alex Elderbd919d42012-07-13 20:35:11 -05001771 dout("created snapid=%llu\n", (unsigned long long) new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001772 if (ret < 0)
1773 return ret;
1774
1775 data = kmalloc(name_len + 16, gfp_flags);
1776 if (!data)
1777 return -ENOMEM;
1778
Sage Weil916d4d62011-05-12 16:10:50 -07001779 p = data;
1780 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001781
Sage Weil916d4d62011-05-12 16:10:50 -07001782 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1783 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001784
Alex Elder0bed54d2012-07-03 16:01:18 -05001785 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
Alex Elder0ce1a792012-07-03 16:01:18 -05001786 "rbd", "snap_add",
Alex Elderd67d4be2012-07-13 20:35:11 -05001787 data, p - data, NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001788
Sage Weil916d4d62011-05-12 16:10:50 -07001789 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001790
Alex Elder505cbb92012-07-19 08:49:18 -05001791 return ret < 0 ? ret : 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001792bad:
1793 return -ERANGE;
1794}
1795
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001796static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1797{
1798 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001799 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001800
Alex Eldera0593292012-07-19 09:09:27 -05001801 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder14e70852012-07-19 09:09:27 -05001802 __rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001803}
1804
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001805/*
1806 * only read the first part of the ondisk header, without the snaps info
1807 */
Alex Elderb8136232012-07-25 09:32:41 -05001808static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001809{
1810 int ret;
1811 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001812
1813 ret = rbd_read_header(rbd_dev, &h);
1814 if (ret < 0)
1815 return ret;
1816
Josh Durgina51aa0c2011-12-05 10:35:04 -08001817 down_write(&rbd_dev->header_rwsem);
1818
Sage Weil9db4b3e2011-04-19 22:49:06 -07001819 /* resized? */
Alex Elderf84344f2012-08-31 17:29:51 -05001820 if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) {
Josh Durgin474ef7c2011-11-21 17:13:54 -08001821 sector_t size = (sector_t) h.image_size / SECTOR_SIZE;
1822
Alex Elder99c1f082012-08-30 14:42:15 -05001823 if (size != (sector_t) rbd_dev->mapping.size) {
1824 dout("setting size to %llu sectors",
1825 (unsigned long long) size);
1826 rbd_dev->mapping.size = (u64) size;
1827 set_capacity(rbd_dev->disk, size);
1828 }
Josh Durgin474ef7c2011-11-21 17:13:54 -08001829 }
Sage Weil9db4b3e2011-04-19 22:49:06 -07001830
Alex Elder849b4262012-07-09 21:04:24 -05001831 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001832 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05001833 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08001834 /* osd requests may still refer to snapc */
1835 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001836
Alex Elderb8136232012-07-25 09:32:41 -05001837 if (hver)
1838 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08001839 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08001840 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001841 rbd_dev->header.snapc = h.snapc;
1842 rbd_dev->header.snap_names = h.snap_names;
1843 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05001844 /* Free the extra copy of the object prefix */
1845 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1846 kfree(h.object_prefix);
1847
Alex Elder9fcbb802012-08-23 23:48:49 -05001848 ret = rbd_dev_snap_devs_update(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001849
Josh Durginc6666012011-11-21 17:11:12 -08001850 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001851
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001852 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001853}
1854
Alex Elder1fe5e992012-07-25 09:32:41 -05001855static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
1856{
1857 int ret;
1858
1859 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1860 ret = __rbd_refresh_header(rbd_dev, hver);
1861 mutex_unlock(&ctl_mutex);
1862
1863 return ret;
1864}
1865
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001866static int rbd_init_disk(struct rbd_device *rbd_dev)
1867{
1868 struct gendisk *disk;
1869 struct request_queue *q;
Alex Elder593a9e72012-02-07 12:03:37 -06001870 u64 segment_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001871
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001872 /* create gendisk info */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001873 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1874 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05001875 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001876
Alex Elderf0f8cef2012-01-29 13:57:44 -06001877 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05001878 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001879 disk->major = rbd_dev->major;
1880 disk->first_minor = 0;
1881 disk->fops = &rbd_bd_ops;
1882 disk->private_data = rbd_dev;
1883
1884 /* init rq */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001885 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1886 if (!q)
1887 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07001888
Alex Elder593a9e72012-02-07 12:03:37 -06001889 /* We use the default size, but let's be explicit about it. */
1890 blk_queue_physical_block_size(q, SECTOR_SIZE);
1891
Josh Durgin029bcbd2011-07-22 11:35:23 -07001892 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06001893 segment_size = rbd_obj_bytes(&rbd_dev->header);
1894 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1895 blk_queue_max_segment_size(q, segment_size);
1896 blk_queue_io_min(q, segment_size);
1897 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07001898
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001899 blk_queue_merge_bvec(q, rbd_merge_bvec);
1900 disk->queue = q;
1901
1902 q->queuedata = rbd_dev;
1903
1904 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001905
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001906 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001907out_disk:
1908 put_disk(disk);
Alex Elder1fcdb8a2012-08-29 17:11:06 -05001909
1910 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001911}
1912
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001913/*
1914 sysfs
1915*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001916
Alex Elder593a9e72012-02-07 12:03:37 -06001917static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1918{
1919 return container_of(dev, struct rbd_device, dev);
1920}
1921
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001922static ssize_t rbd_size_show(struct device *dev,
1923 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001924{
Alex Elder593a9e72012-02-07 12:03:37 -06001925 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001926 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001927
Josh Durgina51aa0c2011-12-05 10:35:04 -08001928 down_read(&rbd_dev->header_rwsem);
1929 size = get_capacity(rbd_dev->disk);
1930 up_read(&rbd_dev->header_rwsem);
1931
1932 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001933}
1934
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001935static ssize_t rbd_major_show(struct device *dev,
1936 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001937{
Alex Elder593a9e72012-02-07 12:03:37 -06001938 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001939
1940 return sprintf(buf, "%d\n", rbd_dev->major);
1941}
1942
1943static ssize_t rbd_client_id_show(struct device *dev,
1944 struct device_attribute *attr, char *buf)
1945{
Alex Elder593a9e72012-02-07 12:03:37 -06001946 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001947
Alex Elder1dbb4392012-01-24 10:08:37 -06001948 return sprintf(buf, "client%lld\n",
1949 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001950}
1951
1952static ssize_t rbd_pool_show(struct device *dev,
1953 struct device_attribute *attr, char *buf)
1954{
Alex Elder593a9e72012-02-07 12:03:37 -06001955 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001956
1957 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1958}
1959
Alex Elder9bb2f332012-07-12 10:46:35 -05001960static ssize_t rbd_pool_id_show(struct device *dev,
1961 struct device_attribute *attr, char *buf)
1962{
1963 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1964
1965 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1966}
1967
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001968static ssize_t rbd_name_show(struct device *dev,
1969 struct device_attribute *attr, char *buf)
1970{
Alex Elder593a9e72012-02-07 12:03:37 -06001971 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001972
Alex Elder0bed54d2012-07-03 16:01:18 -05001973 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001974}
1975
1976static ssize_t rbd_snap_show(struct device *dev,
1977 struct device_attribute *attr,
1978 char *buf)
1979{
Alex Elder593a9e72012-02-07 12:03:37 -06001980 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001981
Alex Elderf84344f2012-08-31 17:29:51 -05001982 return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001983}
1984
1985static ssize_t rbd_image_refresh(struct device *dev,
1986 struct device_attribute *attr,
1987 const char *buf,
1988 size_t size)
1989{
Alex Elder593a9e72012-02-07 12:03:37 -06001990 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05001991 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001992
Alex Elder1fe5e992012-07-25 09:32:41 -05001993 ret = rbd_refresh_header(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05001994
1995 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001996}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001997
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001998static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
1999static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
2000static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
2001static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
Alex Elder9bb2f332012-07-12 10:46:35 -05002002static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002003static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
2004static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2005static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
2006static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002007
2008static struct attribute *rbd_attrs[] = {
2009 &dev_attr_size.attr,
2010 &dev_attr_major.attr,
2011 &dev_attr_client_id.attr,
2012 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05002013 &dev_attr_pool_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002014 &dev_attr_name.attr,
2015 &dev_attr_current_snap.attr,
2016 &dev_attr_refresh.attr,
2017 &dev_attr_create_snap.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002018 NULL
2019};
2020
2021static struct attribute_group rbd_attr_group = {
2022 .attrs = rbd_attrs,
2023};
2024
2025static const struct attribute_group *rbd_attr_groups[] = {
2026 &rbd_attr_group,
2027 NULL
2028};
2029
/* Release callback for rbd_device_type; it does nothing */
static void rbd_sysfs_dev_release(struct device *dev)
{
	/* No work is done here */
}
2033
2034static struct device_type rbd_device_type = {
2035 .name = "rbd",
2036 .groups = rbd_attr_groups,
2037 .release = rbd_sysfs_dev_release,
2038};
2039
2040
2041/*
2042 sysfs - snapshots
2043*/
2044
2045static ssize_t rbd_snap_size_show(struct device *dev,
2046 struct device_attribute *attr,
2047 char *buf)
2048{
2049 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2050
Josh Durgin35915382011-12-05 18:25:13 -08002051 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002052}
2053
2054static ssize_t rbd_snap_id_show(struct device *dev,
2055 struct device_attribute *attr,
2056 char *buf)
2057{
2058 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2059
Josh Durgin35915382011-12-05 18:25:13 -08002060 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002061}
2062
2063static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2064static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
2065
2066static struct attribute *rbd_snap_attrs[] = {
2067 &dev_attr_snap_size.attr,
2068 &dev_attr_snap_id.attr,
2069 NULL,
2070};
2071
2072static struct attribute_group rbd_snap_attr_group = {
2073 .attrs = rbd_snap_attrs,
2074};
2075
2076static void rbd_snap_dev_release(struct device *dev)
2077{
2078 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2079 kfree(snap->name);
2080 kfree(snap);
2081}
2082
2083static const struct attribute_group *rbd_snap_attr_groups[] = {
2084 &rbd_snap_attr_group,
2085 NULL
2086};
2087
2088static struct device_type rbd_snap_device_type = {
2089 .groups = rbd_snap_attr_groups,
2090 .release = rbd_snap_dev_release,
2091};
2092
Alex Elder14e70852012-07-19 09:09:27 -05002093static void __rbd_remove_snap_dev(struct rbd_snap *snap)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002094{
2095 list_del(&snap->node);
2096 device_unregister(&snap->dev);
2097}
2098
Alex Elder14e70852012-07-19 09:09:27 -05002099static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002100 struct device *parent)
2101{
2102 struct device *dev = &snap->dev;
2103 int ret;
2104
2105 dev->type = &rbd_snap_device_type;
2106 dev->parent = parent;
2107 dev->release = rbd_snap_dev_release;
2108 dev_set_name(dev, "snap_%s", snap->name);
2109 ret = device_register(dev);
2110
2111 return ret;
2112}
2113
Alex Elder4e891e02012-07-10 20:30:10 -05002114static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2115 int i, const char *name)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002116{
Alex Elder4e891e02012-07-10 20:30:10 -05002117 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002118 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002119
2120 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002121 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002122 return ERR_PTR(-ENOMEM);
2123
2124 ret = -ENOMEM;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002125 snap->name = kstrdup(name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002126 if (!snap->name)
2127 goto err;
2128
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002129 snap->size = rbd_dev->header.snap_sizes[i];
2130 snap->id = rbd_dev->header.snapc->snaps[i];
2131 if (device_is_registered(&rbd_dev->dev)) {
Alex Elder14e70852012-07-19 09:09:27 -05002132 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002133 if (ret < 0)
2134 goto err;
2135 }
Alex Elder4e891e02012-07-10 20:30:10 -05002136
2137 return snap;
2138
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002139err:
2140 kfree(snap->name);
2141 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002142
2143 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002144}
2145
2146/*
Alex Elder35938152012-08-02 11:29:46 -05002147 * Scan the rbd device's current snapshot list and compare it to the
2148 * newly-received snapshot context. Remove any existing snapshots
2149 * not present in the new snapshot context. Add a new snapshot for
2150 * any snaphots in the snapshot context not in the current list.
2151 * And verify there are no changes to snapshots we already know
2152 * about.
2153 *
2154 * Assumes the snapshots in the snapshot context are sorted by
2155 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2156 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002157 */
Alex Elder9fcbb802012-08-23 23:48:49 -05002158static int rbd_dev_snap_devs_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002159{
Alex Elder35938152012-08-02 11:29:46 -05002160 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2161 const u32 snap_count = snapc->num_snaps;
2162 char *snap_name = rbd_dev->header.snap_names;
2163 struct list_head *head = &rbd_dev->snaps;
2164 struct list_head *links = head->next;
2165 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002166
Alex Elder9fcbb802012-08-23 23:48:49 -05002167 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
Alex Elder35938152012-08-02 11:29:46 -05002168 while (index < snap_count || links != head) {
2169 u64 snap_id;
2170 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002171
Alex Elder35938152012-08-02 11:29:46 -05002172 snap_id = index < snap_count ? snapc->snaps[index]
2173 : CEPH_NOSNAP;
2174 snap = links != head ? list_entry(links, struct rbd_snap, node)
2175 : NULL;
Alex Elderaafb230e2012-09-06 16:00:54 -05002176 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002177
Alex Elder35938152012-08-02 11:29:46 -05002178 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2179 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002180
Alex Elder35938152012-08-02 11:29:46 -05002181 /* Existing snapshot not in the new snap context */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002182
Alex Elderf84344f2012-08-31 17:29:51 -05002183 if (rbd_dev->mapping.snap_id == snap->id)
2184 rbd_dev->mapping.snap_exists = false;
Alex Elder35938152012-08-02 11:29:46 -05002185 __rbd_remove_snap_dev(snap);
Alex Elder9fcbb802012-08-23 23:48:49 -05002186 dout("%ssnap id %llu has been removed\n",
Alex Elderf84344f2012-08-31 17:29:51 -05002187 rbd_dev->mapping.snap_id == snap->id ?
2188 "mapped " : "",
Alex Elder9fcbb802012-08-23 23:48:49 -05002189 (unsigned long long) snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002190
Alex Elder35938152012-08-02 11:29:46 -05002191 /* Done with this list entry; advance */
2192
2193 links = next;
2194 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002195 }
Alex Elder35938152012-08-02 11:29:46 -05002196
Alex Elder9fcbb802012-08-23 23:48:49 -05002197 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2198 (unsigned long long) snap_id);
Alex Elder35938152012-08-02 11:29:46 -05002199 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2200 struct rbd_snap *new_snap;
2201
2202 /* We haven't seen this snapshot before */
2203
2204 new_snap = __rbd_add_snap_dev(rbd_dev, index,
2205 snap_name);
Alex Elder9fcbb802012-08-23 23:48:49 -05002206 if (IS_ERR(new_snap)) {
2207 int err = PTR_ERR(new_snap);
2208
2209 dout(" failed to add dev, error %d\n", err);
2210
2211 return err;
2212 }
Alex Elder35938152012-08-02 11:29:46 -05002213
2214 /* New goes before existing, or at end of list */
2215
Alex Elder9fcbb802012-08-23 23:48:49 -05002216 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05002217 if (snap)
2218 list_add_tail(&new_snap->node, &snap->node);
2219 else
Alex Elder523f3252012-08-30 00:16:37 -05002220 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05002221 } else {
2222 /* Already have this one */
2223
Alex Elder9fcbb802012-08-23 23:48:49 -05002224 dout(" already present\n");
2225
Alex Elderaafb230e2012-09-06 16:00:54 -05002226 rbd_assert(snap->size ==
2227 rbd_dev->header.snap_sizes[index]);
2228 rbd_assert(!strcmp(snap->name, snap_name));
Alex Elder35938152012-08-02 11:29:46 -05002229
2230 /* Done with this list entry; advance */
2231
2232 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002233 }
Alex Elder35938152012-08-02 11:29:46 -05002234
2235 /* Advance to the next entry in the snapshot context */
2236
2237 index++;
2238 snap_name += strlen(snap_name) + 1;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002239 }
Alex Elder9fcbb802012-08-23 23:48:49 -05002240 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002241
2242 return 0;
2243}
2244
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002245static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2246{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002247 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05002248 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002249
2250 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002251
Alex Eldercd789ab2012-08-30 00:16:38 -05002252 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002253 dev->bus = &rbd_bus_type;
2254 dev->type = &rbd_device_type;
2255 dev->parent = &rbd_root_dev;
2256 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05002257 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002258 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002259
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002260 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05002261
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002262 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002263}
2264
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002265static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2266{
2267 device_unregister(&rbd_dev->dev);
2268}
2269
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002270static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2271{
2272 int ret, rc;
2273
2274 do {
Alex Elder0e6f3222012-07-25 09:32:40 -05002275 ret = rbd_req_sync_watch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002276 if (ret == -ERANGE) {
Alex Elder1fe5e992012-07-25 09:32:41 -05002277 rc = rbd_refresh_header(rbd_dev, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002278 if (rc < 0)
2279 return rc;
2280 }
2281 } while (ret == -ERANGE);
2282
2283 return ret;
2284}
2285
Alex Eldere2839302012-08-29 17:11:06 -05002286static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06002287
2288/*
Alex Elder499afd52012-02-02 08:13:29 -06002289 * Get a unique rbd identifier for the given new rbd_dev, and add
2290 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06002291 */
Alex Eldere2839302012-08-29 17:11:06 -05002292static void rbd_dev_id_get(struct rbd_device *rbd_dev)
Alex Elderb7f23c32012-01-29 13:57:43 -06002293{
Alex Eldere2839302012-08-29 17:11:06 -05002294 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
Alex Elder499afd52012-02-02 08:13:29 -06002295
2296 spin_lock(&rbd_dev_list_lock);
2297 list_add_tail(&rbd_dev->node, &rbd_dev_list);
2298 spin_unlock(&rbd_dev_list_lock);
Alex Eldere2839302012-08-29 17:11:06 -05002299 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
2300 (unsigned long long) rbd_dev->dev_id);
Alex Elder1ddbe942012-01-29 13:57:44 -06002301}
Alex Elderb7f23c32012-01-29 13:57:43 -06002302
Alex Elder1ddbe942012-01-29 13:57:44 -06002303/*
Alex Elder499afd52012-02-02 08:13:29 -06002304 * Remove an rbd_dev from the global list, and record that its
2305 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002306 */
Alex Eldere2839302012-08-29 17:11:06 -05002307static void rbd_dev_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002308{
Alex Elderd184f6b2012-01-29 13:57:44 -06002309 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05002310 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002311 int max_id;
2312
Alex Elderaafb230e2012-09-06 16:00:54 -05002313 rbd_assert(rbd_id > 0);
Alex Elder499afd52012-02-02 08:13:29 -06002314
Alex Eldere2839302012-08-29 17:11:06 -05002315 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2316 (unsigned long long) rbd_dev->dev_id);
Alex Elder499afd52012-02-02 08:13:29 -06002317 spin_lock(&rbd_dev_list_lock);
2318 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002319
2320 /*
2321 * If the id being "put" is not the current maximum, there
2322 * is nothing special we need to do.
2323 */
Alex Eldere2839302012-08-29 17:11:06 -05002324 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
Alex Elderd184f6b2012-01-29 13:57:44 -06002325 spin_unlock(&rbd_dev_list_lock);
2326 return;
2327 }
2328
2329 /*
2330 * We need to update the current maximum id. Search the
2331 * list to find out what it is. We're more likely to find
2332 * the maximum at the end, so search the list backward.
2333 */
2334 max_id = 0;
2335 list_for_each_prev(tmp, &rbd_dev_list) {
2336 struct rbd_device *rbd_dev;
2337
2338 rbd_dev = list_entry(tmp, struct rbd_device, node);
2339 if (rbd_id > max_id)
2340 max_id = rbd_id;
2341 }
Alex Elder499afd52012-02-02 08:13:29 -06002342 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002343
Alex Elder1ddbe942012-01-29 13:57:44 -06002344 /*
Alex Eldere2839302012-08-29 17:11:06 -05002345 * The max id could have been updated by rbd_dev_id_get(), in
Alex Elderd184f6b2012-01-29 13:57:44 -06002346 * which case it now accurately reflects the new maximum.
2347 * Be careful not to overwrite the maximum value in that
2348 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002349 */
Alex Eldere2839302012-08-29 17:11:06 -05002350 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2351 dout(" max dev id has been reset\n");
Alex Elderb7f23c32012-01-29 13:57:43 -06002352}
2353
/*
 * Advance *buf past any leading white space so that it points at the
 * first non-space character (or at the terminating '\0' if there is
 * none), and return the length of the token that starts there, i.e.
 * the number of consecutive non-white-space characters.  The return
 * value is 0 when no token remains.
 *
 * The string at *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * Exactly the characters for which isspace() is nonzero in
	 * the "C" and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf + strspn(*buf, whitespace);

	*buf = start;				/* Token begins here */

	return strcspn(start, whitespace);	/* Token length */
}
2372
/*
 * Locate the next token in *buf and, provided it fits, copy it into
 * the caller's token buffer with a terminating '\0'.  The string at
 * *buf must be terminated with '\0' on entry.
 *
 * Returns the token's length, not counting the '\0'.  A return of 0
 * means no token was found; a return >= token_size means the token
 * (plus its terminator) did not fit, in which case the token buffer
 * is not written.
 *
 * In every case *buf is left pointing just past the token that was
 * found, even when the token was too large to be copied.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t token_len = next_token(buf);
	const char *start = *buf;

	*buf = start + token_len;	/* Consume the token regardless */

	if (token_len >= token_size)
		return token_len;	/* Too big; caller checks length */

	memcpy(token, start, token_len);
	token[token_len] = '\0';

	return token_len;
}
2402
2403/*
Alex Elderea3352f2012-07-09 21:04:23 -05002404 * Finds the next token in *buf, dynamically allocates a buffer big
2405 * enough to hold a copy of it, and copies the token into the new
2406 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2407 * that a duplicate buffer is created even for a zero-length token.
2408 *
2409 * Returns a pointer to the newly-allocated duplicate, or a null
2410 * pointer if memory for the duplicate was not available. If
2411 * the lenp argument is a non-null pointer, the length of the token
2412 * (not including the '\0') is returned in *lenp.
2413 *
2414 * If successful, the *buf pointer will be updated to point beyond
2415 * the end of the found token.
2416 *
2417 * Note: uses GFP_KERNEL for allocation.
2418 */
2419static inline char *dup_token(const char **buf, size_t *lenp)
2420{
2421 char *dup;
2422 size_t len;
2423
2424 len = next_token(buf);
2425 dup = kmalloc(len + 1, GFP_KERNEL);
2426 if (!dup)
2427 return NULL;
2428
2429 memcpy(dup, *buf, len);
2430 *(dup + len) = '\0';
2431 *buf += len;
2432
2433 if (lenp)
2434 *lenp = len;
2435
2436 return dup;
2437}
2438
2439/*
Alex Elder3feeb8942012-08-31 17:29:52 -05002440 * This fills in the pool_name, image_name, image_name_len, rbd_dev,
2441 * rbd_md_name, and name fields of the given rbd_dev, based on the
2442 * list of monitor addresses and other options provided via
2443 * /sys/bus/rbd/add. Returns a pointer to a dynamically-allocated
2444 * copy of the snapshot name to map if successful, or a
2445 * pointer-coded error otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05002446 *
2447 * Note: rbd_dev is assumed to have been initially zero-filled.
Alex Eldera725f65e2012-02-02 08:13:30 -06002448 */
Alex Elder3feeb8942012-08-31 17:29:52 -05002449static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
2450 const char *buf,
2451 const char **mon_addrs,
2452 size_t *mon_addrs_size,
2453 char *options,
2454 size_t options_size)
Alex Eldera725f65e2012-02-02 08:13:30 -06002455{
Alex Elderd22f76e2012-07-12 10:46:35 -05002456 size_t len;
Alex Elder3feeb8942012-08-31 17:29:52 -05002457 char *err_ptr = ERR_PTR(-EINVAL);
2458 char *snap_name;
Alex Eldere28fff262012-02-02 08:13:30 -06002459
2460 /* The first four tokens are required */
2461
Alex Elder7ef32142012-02-02 08:13:30 -06002462 len = next_token(&buf);
2463 if (!len)
Alex Elder3feeb8942012-08-31 17:29:52 -05002464 return err_ptr;
Alex Elder5214ecc2012-02-02 08:13:30 -06002465 *mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06002466 *mon_addrs = buf;
2467
2468 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06002469
Alex Eldere28fff262012-02-02 08:13:30 -06002470 len = copy_token(&buf, options, options_size);
2471 if (!len || len >= options_size)
Alex Elder3feeb8942012-08-31 17:29:52 -05002472 return err_ptr;
Alex Eldera725f65e2012-02-02 08:13:30 -06002473
Alex Elder3feeb8942012-08-31 17:29:52 -05002474 err_ptr = ERR_PTR(-ENOMEM);
Alex Elderd22f76e2012-07-12 10:46:35 -05002475 rbd_dev->pool_name = dup_token(&buf, NULL);
2476 if (!rbd_dev->pool_name)
Alex Elderd22f76e2012-07-12 10:46:35 -05002477 goto out_err;
Alex Eldere28fff262012-02-02 08:13:30 -06002478
Alex Elder0bed54d2012-07-03 16:01:18 -05002479 rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
2480 if (!rbd_dev->image_name)
Alex Elderbf3e5ae2012-07-09 21:04:23 -05002481 goto out_err;
Alex Eldere28fff262012-02-02 08:13:30 -06002482
Alex Eldercb8627c2012-07-09 21:04:23 -05002483 /* Create the name of the header object */
2484
Alex Elder0bed54d2012-07-03 16:01:18 -05002485 rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
Alex Elderbf3e5ae2012-07-09 21:04:23 -05002486 + sizeof (RBD_SUFFIX),
2487 GFP_KERNEL);
Alex Elder0bed54d2012-07-03 16:01:18 -05002488 if (!rbd_dev->header_name)
Alex Eldercb8627c2012-07-09 21:04:23 -05002489 goto out_err;
Alex Elder0bed54d2012-07-03 16:01:18 -05002490 sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
Alex Eldera725f65e2012-02-02 08:13:30 -06002491
Alex Elder3feeb8942012-08-31 17:29:52 -05002492 /* Snapshot name is optional */
2493 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05002494 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05002495 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
2496 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elder849b4262012-07-09 21:04:24 -05002497 }
Alex Elder3feeb8942012-08-31 17:29:52 -05002498 snap_name = kmalloc(len + 1, GFP_KERNEL);
2499 if (!snap_name)
2500 goto out_err;
2501 memcpy(snap_name, buf, len);
2502 *(snap_name + len) = '\0';
Alex Eldere28fff262012-02-02 08:13:30 -06002503
Alex Elder3feeb8942012-08-31 17:29:52 -05002504dout(" SNAP_NAME is <%s>, len is %zd\n", snap_name, len);
2505
2506 return snap_name;
Alex Elderd22f76e2012-07-12 10:46:35 -05002507
2508out_err:
Alex Elder0bed54d2012-07-03 16:01:18 -05002509 kfree(rbd_dev->header_name);
Alex Elderd78fd7a2012-07-26 23:37:14 -05002510 rbd_dev->header_name = NULL;
Alex Elder0bed54d2012-07-03 16:01:18 -05002511 kfree(rbd_dev->image_name);
Alex Elderd78fd7a2012-07-26 23:37:14 -05002512 rbd_dev->image_name = NULL;
2513 rbd_dev->image_name_len = 0;
Alex Elderd22f76e2012-07-12 10:46:35 -05002514 kfree(rbd_dev->pool_name);
2515 rbd_dev->pool_name = NULL;
2516
Alex Elder3feeb8942012-08-31 17:29:52 -05002517 return err_ptr;
Alex Eldera725f65e2012-02-02 08:13:30 -06002518}
2519
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002520static ssize_t rbd_add(struct bus_type *bus,
2521 const char *buf,
2522 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002523{
Alex Eldercb8627c2012-07-09 21:04:23 -05002524 char *options;
2525 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06002526 const char *mon_addrs = NULL;
2527 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06002528 struct ceph_osd_client *osdc;
2529 int rc = -ENOMEM;
Alex Elder3feeb8942012-08-31 17:29:52 -05002530 char *snap_name;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002531
2532 if (!try_module_get(THIS_MODULE))
2533 return -ENODEV;
2534
Alex Elder27cc2592012-02-02 08:13:30 -06002535 options = kmalloc(count, GFP_KERNEL);
2536 if (!options)
2537 goto err_nomem;
Alex Eldercb8627c2012-07-09 21:04:23 -05002538 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2539 if (!rbd_dev)
2540 goto err_nomem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002541
2542 /* static rbd_device initialization */
2543 spin_lock_init(&rbd_dev->lock);
2544 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002545 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08002546 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002547
Alex Elderd184f6b2012-01-29 13:57:44 -06002548 /* generate unique id: find highest unique id, add one */
Alex Eldere2839302012-08-29 17:11:06 -05002549 rbd_dev_id_get(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002550
Alex Eldera725f65e2012-02-02 08:13:30 -06002551 /* Fill in the device name, now that we have its id. */
Alex Elder81a89792012-02-02 08:13:30 -06002552 BUILD_BUG_ON(DEV_NAME_LEN
2553 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
Alex Elderde71a292012-07-03 16:01:19 -05002554 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
Alex Eldere124a822012-01-29 13:57:44 -06002555
Alex Eldera725f65e2012-02-02 08:13:30 -06002556 /* parse add command */
Alex Elder3feeb8942012-08-31 17:29:52 -05002557 snap_name = rbd_add_parse_args(rbd_dev, buf,
2558 &mon_addrs, &mon_addrs_size, options, count);
2559 if (IS_ERR(snap_name)) {
2560 rc = PTR_ERR(snap_name);
Alex Eldera725f65e2012-02-02 08:13:30 -06002561 goto err_put_id;
Alex Elder3feeb8942012-08-31 17:29:52 -05002562 }
Alex Eldera725f65e2012-02-02 08:13:30 -06002563
Alex Elderf8c38922012-08-10 13:12:07 -07002564 rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
2565 if (rc < 0)
Alex Elderf0f8cef2012-01-29 13:57:44 -06002566 goto err_put_id;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002567
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002568 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06002569 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002570 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2571 if (rc < 0)
2572 goto err_out_client;
Alex Elder9bb2f332012-07-12 10:46:35 -05002573 rbd_dev->pool_id = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002574
2575 /* register our block device */
Alex Elder27cc2592012-02-02 08:13:30 -06002576 rc = register_blkdev(0, rbd_dev->name);
2577 if (rc < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002578 goto err_out_client;
Alex Elder27cc2592012-02-02 08:13:30 -06002579 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002580
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002581 rc = rbd_bus_add_dev(rbd_dev);
2582 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002583 goto err_out_blkdev;
2584
Alex Elder32eec682012-02-08 16:11:14 -06002585 /*
2586 * At this point cleanup in the event of an error is the job
2587 * of the sysfs code (initiated by rbd_bus_del_dev()).
Alex Elder32eec682012-02-08 16:11:14 -06002588 */
Alex Elder2ac4e752012-07-10 20:30:10 -05002589
2590 /* contact OSD, request size info about the object being mapped */
2591 rc = rbd_read_header(rbd_dev, &rbd_dev->header);
2592 if (rc)
2593 goto err_out_bus;
2594
2595 /* no need to lock here, as rbd_dev is not registered yet */
2596 rc = rbd_dev_snap_devs_update(rbd_dev);
2597 if (rc)
2598 goto err_out_bus;
2599
Alex Elder4bb1f1e2012-08-23 23:48:49 -05002600 down_write(&rbd_dev->header_rwsem);
Alex Elder2ac4e752012-07-10 20:30:10 -05002601 rc = rbd_header_set_snap(rbd_dev, snap_name);
Alex Elder4bb1f1e2012-08-23 23:48:49 -05002602 up_write(&rbd_dev->header_rwsem);
Alex Elder2ac4e752012-07-10 20:30:10 -05002603 if (rc)
2604 goto err_out_bus;
2605
2606 /* Set up the blkdev mapping. */
2607
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002608 rc = rbd_init_disk(rbd_dev);
2609 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002610 goto err_out_bus;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002611
Alex Elder2ac4e752012-07-10 20:30:10 -05002612 /* Everything's ready. Announce the disk to the world. */
2613
2614 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
2615 add_disk(rbd_dev->disk);
2616 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
2617 (unsigned long long) rbd_dev->mapping.size);
2618
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002619 rc = rbd_init_watch_dev(rbd_dev);
2620 if (rc)
2621 goto err_out_bus;
2622
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002623 return count;
2624
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002625err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002626 /* this will also clean up rest of rbd_dev stuff */
2627
2628 rbd_bus_del_dev(rbd_dev);
2629 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002630 return rc;
2631
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002632err_out_blkdev:
2633 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2634err_out_client:
2635 rbd_put_client(rbd_dev);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002636err_put_id:
Alex Eldercb8627c2012-07-09 21:04:23 -05002637 if (rbd_dev->pool_name) {
Alex Elderf84344f2012-08-31 17:29:51 -05002638 kfree(rbd_dev->mapping.snap_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002639 kfree(rbd_dev->header_name);
2640 kfree(rbd_dev->image_name);
Alex Eldercb8627c2012-07-09 21:04:23 -05002641 kfree(rbd_dev->pool_name);
2642 }
Alex Eldere2839302012-08-29 17:11:06 -05002643 rbd_dev_id_put(rbd_dev);
Alex Elder27cc2592012-02-02 08:13:30 -06002644err_nomem:
Alex Elder27cc2592012-02-02 08:13:30 -06002645 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05002646 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06002647
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002648 dout("Error adding device %s\n", buf);
2649 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06002650
2651 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002652}
2653
Alex Elderde71a292012-07-03 16:01:19 -05002654static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002655{
2656 struct list_head *tmp;
2657 struct rbd_device *rbd_dev;
2658
Alex Eldere124a822012-01-29 13:57:44 -06002659 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002660 list_for_each(tmp, &rbd_dev_list) {
2661 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05002662 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06002663 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002664 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06002665 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002666 }
Alex Eldere124a822012-01-29 13:57:44 -06002667 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002668 return NULL;
2669}
2670
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002671static void rbd_dev_release(struct device *dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002672{
Alex Elder593a9e72012-02-07 12:03:37 -06002673 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002674
Alex Elder1dbb4392012-01-24 10:08:37 -06002675 if (rbd_dev->watch_request) {
2676 struct ceph_client *client = rbd_dev->rbd_client->client;
2677
2678 ceph_osdc_unregister_linger_request(&client->osdc,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002679 rbd_dev->watch_request);
Alex Elder1dbb4392012-01-24 10:08:37 -06002680 }
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002681 if (rbd_dev->watch_event)
Alex Elder070c6332012-07-25 09:32:41 -05002682 rbd_req_sync_unwatch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002683
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002684 rbd_put_client(rbd_dev);
2685
2686 /* clean up and free blkdev */
2687 rbd_free_disk(rbd_dev);
2688 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder32eec682012-02-08 16:11:14 -06002689
Alex Elder2ac4e752012-07-10 20:30:10 -05002690 /* release allocated disk header fields */
2691 rbd_header_free(&rbd_dev->header);
2692
Alex Elder32eec682012-02-08 16:11:14 -06002693 /* done with the id, and with the rbd_dev */
Alex Elderf84344f2012-08-31 17:29:51 -05002694 kfree(rbd_dev->mapping.snap_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002695 kfree(rbd_dev->header_name);
Alex Elderd22f76e2012-07-12 10:46:35 -05002696 kfree(rbd_dev->pool_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002697 kfree(rbd_dev->image_name);
Alex Eldere2839302012-08-29 17:11:06 -05002698 rbd_dev_id_put(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002699 kfree(rbd_dev);
2700
2701 /* release module ref */
2702 module_put(THIS_MODULE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002703}
2704
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002705static ssize_t rbd_remove(struct bus_type *bus,
2706 const char *buf,
2707 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002708{
2709 struct rbd_device *rbd_dev = NULL;
2710 int target_id, rc;
2711 unsigned long ul;
2712 int ret = count;
2713
2714 rc = strict_strtoul(buf, 10, &ul);
2715 if (rc)
2716 return rc;
2717
2718 /* convert to int; abort if we lost anything in the conversion */
2719 target_id = (int) ul;
2720 if (target_id != ul)
2721 return -EINVAL;
2722
2723 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2724
2725 rbd_dev = __rbd_get_dev(target_id);
2726 if (!rbd_dev) {
2727 ret = -ENOENT;
2728 goto done;
2729 }
2730
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002731 __rbd_remove_all_snaps(rbd_dev);
2732 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002733
2734done:
2735 mutex_unlock(&ctl_mutex);
Alex Elderaafb230e2012-09-06 16:00:54 -05002736
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002737 return ret;
2738}
2739
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002740static ssize_t rbd_snap_add(struct device *dev,
2741 struct device_attribute *attr,
2742 const char *buf,
2743 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002744{
Alex Elder593a9e72012-02-07 12:03:37 -06002745 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002746 int ret;
2747 char *name = kmalloc(count + 1, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002748 if (!name)
2749 return -ENOMEM;
2750
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002751 snprintf(name, count, "%s", buf);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002752
2753 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2754
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002755 ret = rbd_header_add_snap(rbd_dev,
2756 name, GFP_KERNEL);
2757 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002758 goto err_unlock;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002759
Alex Elderb8136232012-07-25 09:32:41 -05002760 ret = __rbd_refresh_header(rbd_dev, NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002761 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002762 goto err_unlock;
2763
2764 /* shouldn't hold ctl_mutex when notifying.. notify might
2765 trigger a watch callback that would need to get that mutex */
2766 mutex_unlock(&ctl_mutex);
2767
2768 /* make a best effort, don't error if failed */
Alex Elder4cb16252012-07-25 09:32:40 -05002769 rbd_req_sync_notify(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002770
2771 ret = count;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002772 kfree(name);
2773 return ret;
2774
2775err_unlock:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002776 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002777 kfree(name);
2778 return ret;
2779}
2780
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002781/*
2782 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002783 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002784 */
2785static int rbd_sysfs_init(void)
2786{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002787 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002788
Alex Elderfed4c142012-02-07 12:03:36 -06002789 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06002790 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002791 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002792
Alex Elderfed4c142012-02-07 12:03:36 -06002793 ret = bus_register(&rbd_bus_type);
2794 if (ret < 0)
2795 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002796
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002797 return ret;
2798}
2799
2800static void rbd_sysfs_cleanup(void)
2801{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002802 bus_unregister(&rbd_bus_type);
Alex Elderfed4c142012-02-07 12:03:36 -06002803 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002804}
2805
2806int __init rbd_init(void)
2807{
2808 int rc;
2809
2810 rc = rbd_sysfs_init();
2811 if (rc)
2812 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002813 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002814 return 0;
2815}
2816
2817void __exit rbd_exit(void)
2818{
2819 rbd_sysfs_cleanup();
2820}
2821
2822module_init(rbd_init);
2823module_exit(rbd_exit);
2824
2825MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
2826MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
2827MODULE_DESCRIPTION("rados block device");
2828
2829/* following authorship retained from original osdblk.c */
2830MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
2831
2832MODULE_LICENSE("GPL");