blob: e5eaa70e88263110421d04019bc416decfea52e0 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
/*
 * The basic unit of block I/O is a sector. It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes. These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define SECTOR_SHIFT 9
#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)

/* Short and long (descriptive) driver names */
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */

#define RBD_MAX_SNAP_NAME_LEN 32
#define RBD_MAX_OPT_LEN 1024

/*
 * Snapshot name that selects the image head: when a device's snap_name
 * equals this string, rbd_header_set_snap() uses CEPH_NOSNAP.
 */
#define RBD_SNAP_HEAD_NAME "-"

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 *
 * MAX_INT_FORMAT_WIDTH allows 2.5 characters per byte of an int plus
 * one more (e.g. 11 for a 4-byte int, enough for "-2147483648").
 */
#define DEV_NAME_LEN 32
#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)

/* NOTE(review): units are not visible here (presumably seconds) -- confirm */
#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
73
/*
 * block device image metadata (in-memory version)
 *
 * Filled in from the on-disk header by rbd_header_from_disk() and torn
 * down by rbd_header_free().
 */
struct rbd_image_header {
	u64 image_size;		/* from the on-disk image_size field */
	char *object_prefix;	/* NUL-terminated copy of on-disk block_name */
	__u8 obj_order;		/* object size is (1 << obj_order) bytes */
	__u8 crypt_type;
	__u8 comp_type;
	struct ceph_snap_context *snapc;	/* snapshot ids and seq */
	u64 snap_names_len;	/* size in bytes of the snap_names buffer */
	u32 total_snaps;	/* number of snapshots */

	char *snap_names;	/* back-to-back NUL-terminated names */
	u64 *snap_sizes;	/* per-snapshot image size, total_snaps entries */

	u64 obj_version;
};
92
/* rbd-specific options, filled in by parse_rbd_opts_token() */
struct rbd_options {
	int notify_timeout;	/* defaults to RBD_NOTIFY_TIMEOUT_DEFAULT */
};
96
/*
 * an instance of the client. multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client *client;	/* underlying ceph client */
	struct rbd_options *rbd_opts;	/* freed in rbd_client_release() */
	struct kref kref;		/* released via rbd_client_release() */
	struct list_head node;		/* entry in rbd_client_list */
};
106
/*
 * a request completion status
 *
 * One slot of an rbd_req_coll; filled in by rbd_coll_end_req_index().
 */
struct rbd_req_status {
	int done;	/* set to 1 once this slot has completed */
	int rc;		/* result passed on to __blk_end_request() */
	u64 bytes;	/* byte count passed on to __blk_end_request() */
};
115
/*
 * a collection of requests
 *
 * Slots are reported to the block layer in index order: num_done is the
 * index of the first slot not yet reported (see rbd_coll_end_req_index()).
 */
struct rbd_req_coll {
	int total;		/* number of entries in status[] */
	int num_done;		/* slots already ended, in order */
	struct kref kref;	/* one reference is dropped per ended slot */
	struct rbd_req_status status[0];	/* trailing per-slot status */
};
125
/*
 * a single io request
 */
struct rbd_request {
	struct request *rq; /* blk layer request */
	struct bio *bio; /* cloned bio */
	struct page **pages; /* list of used pages */
	u64 len;
	int coll_index;	/* this request's slot in coll->status[] */
	struct rbd_req_coll *coll;	/* owning collection; may be NULL */
};
137
/*
 * a single snapshot of an image
 */
struct rbd_snap {
	struct device dev;
	const char *name;
	u64 size;
	struct list_head node;	/* NOTE(review): presumably on rbd_device.snaps */
	u64 id;
};
145
/*
 * a single device
 */
struct rbd_device {
	int dev_id; /* blkdev unique id */

	int major; /* blkdev assigned major */
	struct gendisk *disk; /* blkdev's gendisk and rq */
	struct request_queue *q;

	struct rbd_client *rbd_client;	/* reference dropped by rbd_put_client() */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock; /* queue lock */

	struct rbd_image_header header;	/* in-memory image metadata */
	char *image_name;
	size_t image_name_len;
	char *header_name;
	char *pool_name;
	int pool_id;

	struct ceph_osd_event *watch_event;
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore header_rwsem;
	/* name of the snapshot this device reads from */
	char *snap_name;
	/* id of the snapshot this device reads from */
	u64 snap_id; /* current snapshot id */
	/* whether the snap_id this device reads from still exists */
	bool snap_exists;
	int read_only;	/* nonzero: rbd_open() refuses FMODE_WRITE */

	struct list_head node;	/* NOTE(review): presumably on rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;	/* refcounted via rbd_get_dev()/rbd_put_dev() */
};
190
static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list); /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

/* rbd_client_list_lock is taken around lookups, insertions and removals */
static LIST_HEAD(rbd_client_list); /* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700198
/* Forward declarations for snapshot and device helpers */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
static void rbd_dev_release(struct device *dev);
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count);
static void __rbd_remove_snap_dev(struct rbd_snap *snap);

/* Store handlers for the bus-level "add" and "remove" attributes */
static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
211
/*
 * Bus attributes: "add" and "remove" are owner-writable only (S_IWUSR)
 * and have no show handler.
 */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

/* The "rbd" bus, carrying the attributes above */
static struct bus_type rbd_bus_type = {
	.name = "rbd",
	.bus_attrs = rbd_bus_attrs,
};
222
/* rbd_root_dev is statically allocated, so there is nothing to free here */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Static root device named "rbd" */
static struct device rbd_root_dev = {
	.init_name = "rbd",
	.release = rbd_root_dev_release,
};
231
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800232
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800233static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
234{
235 return get_device(&rbd_dev->dev);
236}
237
238static void rbd_put_dev(struct rbd_device *rbd_dev)
239{
240 put_device(&rbd_dev->dev);
241}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700242
/* Forward declaration */
static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700244
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700245static int rbd_open(struct block_device *bdev, fmode_t mode)
246{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600247 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700248
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700249 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
250 return -EROFS;
251
Alex Elder340c7a22012-08-10 13:12:07 -0700252 rbd_get_dev(rbd_dev);
253 set_device_ro(bdev, rbd_dev->read_only);
254
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700255 return 0;
256}
257
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800258static int rbd_release(struct gendisk *disk, fmode_t mode)
259{
260 struct rbd_device *rbd_dev = disk->private_data;
261
262 rbd_put_dev(rbd_dev);
263
264 return 0;
265}
266
/* Block device operations: only open and release are provided */
static const struct block_device_operations rbd_bd_ops = {
	.owner = THIS_MODULE,
	.open = rbd_open,
	.release = rbd_release,
};
272
273/*
274 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500275 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700276 */
Alex Elder43ae4702012-07-03 16:01:18 -0500277static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700278 struct rbd_options *rbd_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700279{
280 struct rbd_client *rbdc;
281 int ret = -ENOMEM;
282
283 dout("rbd_client_create\n");
284 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
285 if (!rbdc)
286 goto out_opt;
287
288 kref_init(&rbdc->kref);
289 INIT_LIST_HEAD(&rbdc->node);
290
Alex Elderbc534d82012-01-29 13:57:44 -0600291 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
292
Alex Elder43ae4702012-07-03 16:01:18 -0500293 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700294 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600295 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500296 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700297
298 ret = ceph_open_session(rbdc->client);
299 if (ret < 0)
300 goto out_err;
301
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700302 rbdc->rbd_opts = rbd_opts;
303
Alex Elder432b8582012-01-29 13:57:44 -0600304 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700305 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600306 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700307
Alex Elderbc534d82012-01-29 13:57:44 -0600308 mutex_unlock(&ctl_mutex);
309
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700310 dout("rbd_client_create created %p\n", rbdc);
311 return rbdc;
312
313out_err:
314 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600315out_mutex:
316 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700317 kfree(rbdc);
318out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500319 if (ceph_opts)
320 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400321 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700322}
323
324/*
325 * Find a ceph client with specific addr and configuration.
326 */
Alex Elder43ae4702012-07-03 16:01:18 -0500327static struct rbd_client *__rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700328{
329 struct rbd_client *client_node;
330
Alex Elder43ae4702012-07-03 16:01:18 -0500331 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700332 return NULL;
333
334 list_for_each_entry(client_node, &rbd_client_list, node)
Alex Elder43ae4702012-07-03 16:01:18 -0500335 if (!ceph_compare_options(ceph_opts, client_node->client))
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700336 return client_node;
337 return NULL;
338}
339
/*
 * mount options
 *
 * Token values below Opt_last_int take an integer argument; values
 * between Opt_last_int and Opt_last_string take a string argument
 * (there are none at present).
 */
enum {
	Opt_notify_timeout,
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
};

/* Patterns handed to match_token() by parse_rbd_opts_token() */
static match_table_t rbd_opts_tokens = {
	{Opt_notify_timeout, "notify_timeout=%d"},
	/* int args above */
	/* string args above */
	{-1, NULL}
};
357
358static int parse_rbd_opts_token(char *c, void *private)
359{
Alex Elder43ae4702012-07-03 16:01:18 -0500360 struct rbd_options *rbd_opts = private;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700361 substring_t argstr[MAX_OPT_ARGS];
362 int token, intval, ret;
363
Alex Elder43ae4702012-07-03 16:01:18 -0500364 token = match_token(c, rbd_opts_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700365 if (token < 0)
366 return -EINVAL;
367
368 if (token < Opt_last_int) {
369 ret = match_int(&argstr[0], &intval);
370 if (ret < 0) {
371 pr_err("bad mount option arg (not int) "
372 "at '%s'\n", c);
373 return ret;
374 }
375 dout("got int token %d val %d\n", token, intval);
376 } else if (token > Opt_last_int && token < Opt_last_string) {
377 dout("got string token %d val %s\n", token,
378 argstr[0].from);
379 } else {
380 dout("got token %d\n", token);
381 }
382
383 switch (token) {
384 case Opt_notify_timeout:
Alex Elder43ae4702012-07-03 16:01:18 -0500385 rbd_opts->notify_timeout = intval;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700386 break;
387 default:
388 BUG_ON(token);
389 }
390 return 0;
391}
392
393/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700394 * Get a ceph client with specific addr and configuration, if one does
395 * not exist create it.
396 */
Alex Elder5214ecc2012-02-02 08:13:30 -0600397static struct rbd_client *rbd_get_client(const char *mon_addr,
398 size_t mon_addr_len,
399 char *options)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700400{
401 struct rbd_client *rbdc;
Alex Elder43ae4702012-07-03 16:01:18 -0500402 struct ceph_options *ceph_opts;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700403 struct rbd_options *rbd_opts;
404
405 rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
406 if (!rbd_opts)
Alex Elderd720bcb2012-02-02 08:13:30 -0600407 return ERR_PTR(-ENOMEM);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700408
409 rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700410
Alex Elder43ae4702012-07-03 16:01:18 -0500411 ceph_opts = ceph_parse_options(options, mon_addr,
412 mon_addr + mon_addr_len,
413 parse_rbd_opts_token, rbd_opts);
414 if (IS_ERR(ceph_opts)) {
Alex Elderd720bcb2012-02-02 08:13:30 -0600415 kfree(rbd_opts);
Alex Elder43ae4702012-07-03 16:01:18 -0500416 return ERR_CAST(ceph_opts);
Alex Elderee577412012-01-24 10:08:36 -0600417 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700418
Alex Elder432b8582012-01-29 13:57:44 -0600419 spin_lock(&rbd_client_list_lock);
Alex Elder43ae4702012-07-03 16:01:18 -0500420 rbdc = __rbd_client_find(ceph_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700421 if (rbdc) {
Alex Eldere6994d32012-01-29 13:57:44 -0600422 /* using an existing client */
423 kref_get(&rbdc->kref);
Alex Elder432b8582012-01-29 13:57:44 -0600424 spin_unlock(&rbd_client_list_lock);
Alex Eldere6994d32012-01-29 13:57:44 -0600425
Alex Elder43ae4702012-07-03 16:01:18 -0500426 ceph_destroy_options(ceph_opts);
Alex Elder97bb59a2012-01-24 10:08:36 -0600427 kfree(rbd_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700428
Alex Elderd720bcb2012-02-02 08:13:30 -0600429 return rbdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700430 }
Alex Elder432b8582012-01-29 13:57:44 -0600431 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700432
Alex Elder43ae4702012-07-03 16:01:18 -0500433 rbdc = rbd_client_create(ceph_opts, rbd_opts);
Alex Elderd97081b2012-01-29 13:57:44 -0600434
Alex Elderd720bcb2012-02-02 08:13:30 -0600435 if (IS_ERR(rbdc))
436 kfree(rbd_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700437
Alex Elderd720bcb2012-02-02 08:13:30 -0600438 return rbdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700439}
440
441/*
442 * Destroy ceph client
Alex Elderd23a4b32012-01-29 13:57:43 -0600443 *
Alex Elder432b8582012-01-29 13:57:44 -0600444 * Caller must hold rbd_client_list_lock.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700445 */
446static void rbd_client_release(struct kref *kref)
447{
448 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
449
450 dout("rbd_release_client %p\n", rbdc);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500451 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700452 list_del(&rbdc->node);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500453 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700454
455 ceph_destroy_client(rbdc->client);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700456 kfree(rbdc->rbd_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700457 kfree(rbdc);
458}
459
460/*
461 * Drop reference to ceph client node. If it's not referenced anymore, release
462 * it.
463 */
464static void rbd_put_client(struct rbd_device *rbd_dev)
465{
466 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
467 rbd_dev->rbd_client = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700468}
469
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700470/*
471 * Destroy requests collection
472 */
473static void rbd_coll_release(struct kref *kref)
474{
475 struct rbd_req_coll *coll =
476 container_of(kref, struct rbd_req_coll, kref);
477
478 dout("rbd_coll_release %p\n", coll);
479 kfree(coll);
480}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700481
Alex Elder8e94af82012-07-25 09:32:40 -0500482static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
483{
484 return !memcmp(&ondisk->text,
485 RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT));
486}
487
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700488/*
489 * Create a new header structure, translate header format from the on-disk
490 * header.
491 */
492static int rbd_header_from_disk(struct rbd_image_header *header,
493 struct rbd_image_header_ondisk *ondisk,
Alex Eldered63f4f2012-07-19 09:09:27 -0500494 u32 allocated_snaps)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700495{
Alex Elderccece232012-07-10 20:30:10 -0500496 u32 snap_count;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700497
Alex Elder8e94af82012-07-25 09:32:40 -0500498 if (!rbd_dev_ondisk_valid(ondisk))
Josh Durgin81e759f2011-11-15 14:49:53 -0800499 return -ENXIO;
Josh Durgin81e759f2011-11-15 14:49:53 -0800500
Alex Elder00f1f362012-02-07 12:03:36 -0600501 snap_count = le32_to_cpu(ondisk->snap_count);
Alex Elderccece232012-07-10 20:30:10 -0500502 if (snap_count > (SIZE_MAX - sizeof(struct ceph_snap_context))
503 / sizeof (u64))
Xi Wang50f7c4c2012-04-20 15:49:44 -0500504 return -EINVAL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700505 header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
Yan, Zhengf9f9a192012-06-06 09:15:33 -0500506 snap_count * sizeof(u64),
Alex Eldered63f4f2012-07-19 09:09:27 -0500507 GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700508 if (!header->snapc)
509 return -ENOMEM;
Alex Elder00f1f362012-02-07 12:03:36 -0600510
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700511 if (snap_count) {
Alex Elderccece232012-07-10 20:30:10 -0500512 header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
Alex Elder0f1d3f92012-08-02 11:29:44 -0500513 BUG_ON(header->snap_names_len > (u64) SIZE_MAX);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700514 header->snap_names = kmalloc(header->snap_names_len,
Alex Eldered63f4f2012-07-19 09:09:27 -0500515 GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700516 if (!header->snap_names)
517 goto err_snapc;
518 header->snap_sizes = kmalloc(snap_count * sizeof(u64),
Alex Eldered63f4f2012-07-19 09:09:27 -0500519 GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700520 if (!header->snap_sizes)
521 goto err_names;
522 } else {
Alex Elderccece232012-07-10 20:30:10 -0500523 WARN_ON(ondisk->snap_names_len);
524 header->snap_names_len = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700525 header->snap_names = NULL;
526 header->snap_sizes = NULL;
527 }
Alex Elder849b4262012-07-09 21:04:24 -0500528
529 header->object_prefix = kmalloc(sizeof (ondisk->block_name) + 1,
Alex Eldered63f4f2012-07-19 09:09:27 -0500530 GFP_KERNEL);
Alex Elder849b4262012-07-09 21:04:24 -0500531 if (!header->object_prefix)
532 goto err_sizes;
533
Alex Elderca1e49a2012-07-10 20:30:09 -0500534 memcpy(header->object_prefix, ondisk->block_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700535 sizeof(ondisk->block_name));
Alex Elder849b4262012-07-09 21:04:24 -0500536 header->object_prefix[sizeof (ondisk->block_name)] = '\0';
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700537
538 header->image_size = le64_to_cpu(ondisk->image_size);
539 header->obj_order = ondisk->options.order;
540 header->crypt_type = ondisk->options.crypt_type;
541 header->comp_type = ondisk->options.comp_type;
542
543 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500544 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700545 header->snapc->num_snaps = snap_count;
546 header->total_snaps = snap_count;
547
Alex Elder21079782012-01-24 10:08:36 -0600548 if (snap_count && allocated_snaps == snap_count) {
Alex Elderccece232012-07-10 20:30:10 -0500549 int i;
550
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700551 for (i = 0; i < snap_count; i++) {
552 header->snapc->snaps[i] =
553 le64_to_cpu(ondisk->snaps[i].id);
554 header->snap_sizes[i] =
555 le64_to_cpu(ondisk->snaps[i].image_size);
556 }
557
558 /* copy snapshot names */
Alex Elderccece232012-07-10 20:30:10 -0500559 memcpy(header->snap_names, &ondisk->snaps[snap_count],
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700560 header->snap_names_len);
561 }
562
563 return 0;
564
Alex Elder849b4262012-07-09 21:04:24 -0500565err_sizes:
566 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500567 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700568err_names:
569 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500570 header->snap_names = NULL;
Alex Elderd78fd7a2012-07-26 23:37:14 -0500571 header->snap_names_len = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700572err_snapc:
573 kfree(header->snapc);
Alex Elderccece232012-07-10 20:30:10 -0500574 header->snapc = NULL;
575
Alex Elder00f1f362012-02-07 12:03:36 -0600576 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700577}
578
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700579static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
580 u64 *seq, u64 *size)
581{
582 int i;
583 char *p = header->snap_names;
584
Alex Elder00f1f362012-02-07 12:03:36 -0600585 for (i = 0; i < header->total_snaps; i++) {
586 if (!strcmp(snap_name, p)) {
587
588 /* Found it. Pass back its id and/or size */
589
590 if (seq)
591 *seq = header->snapc->snaps[i];
592 if (size)
593 *size = header->snap_sizes[i];
594 return i;
595 }
596 p += strlen(p) + 1; /* Skip ahead to the next name */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700597 }
Alex Elder00f1f362012-02-07 12:03:36 -0600598 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700599}
600
Alex Elder0ce1a792012-07-03 16:01:18 -0500601static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700602{
Alex Elder78dc4472012-07-19 08:49:18 -0500603 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700604
Alex Elder0ce1a792012-07-03 16:01:18 -0500605 down_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700606
Alex Elder0ce1a792012-07-03 16:01:18 -0500607 if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
Josh Durgincc9d7342011-11-21 18:19:13 -0800608 sizeof (RBD_SNAP_HEAD_NAME))) {
Alex Elder0ce1a792012-07-03 16:01:18 -0500609 rbd_dev->snap_id = CEPH_NOSNAP;
Josh Durgine88a36e2011-11-21 18:14:25 -0800610 rbd_dev->snap_exists = false;
Alex Elder0ce1a792012-07-03 16:01:18 -0500611 rbd_dev->read_only = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700612 if (size)
Alex Elder78dc4472012-07-19 08:49:18 -0500613 *size = rbd_dev->header.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700614 } else {
Alex Elder78dc4472012-07-19 08:49:18 -0500615 u64 snap_id = 0;
616
617 ret = snap_by_name(&rbd_dev->header, rbd_dev->snap_name,
618 &snap_id, size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700619 if (ret < 0)
620 goto done;
Alex Elder78dc4472012-07-19 08:49:18 -0500621 rbd_dev->snap_id = snap_id;
Josh Durgine88a36e2011-11-21 18:14:25 -0800622 rbd_dev->snap_exists = true;
Alex Elder0ce1a792012-07-03 16:01:18 -0500623 rbd_dev->read_only = 1;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700624 }
625
626 ret = 0;
627done:
Alex Elder0ce1a792012-07-03 16:01:18 -0500628 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700629 return ret;
630}
631
632static void rbd_header_free(struct rbd_image_header *header)
633{
Alex Elder849b4262012-07-09 21:04:24 -0500634 kfree(header->object_prefix);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500635 header->object_prefix = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700636 kfree(header->snap_sizes);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500637 header->snap_sizes = NULL;
Alex Elder849b4262012-07-09 21:04:24 -0500638 kfree(header->snap_names);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500639 header->snap_names = NULL;
640 header->snap_names_len = 0;
Josh Durgind1d25642011-12-05 14:03:05 -0800641 ceph_put_snap_context(header->snapc);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500642 header->snapc = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700643}
644
645/*
646 * get the actual striped segment name, offset and length
647 */
648static u64 rbd_get_segment(struct rbd_image_header *header,
Alex Elderca1e49a2012-07-10 20:30:09 -0500649 const char *object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700650 u64 ofs, u64 len,
651 char *seg_name, u64 *segofs)
652{
653 u64 seg = ofs >> header->obj_order;
654
655 if (seg_name)
656 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
Alex Elderca1e49a2012-07-10 20:30:09 -0500657 "%s.%012llx", object_prefix, seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700658
659 ofs = ofs & ((1 << header->obj_order) - 1);
660 len = min_t(u64, len, (1 << header->obj_order) - ofs);
661
662 if (segofs)
663 *segofs = ofs;
664
665 return len;
666}
667
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700668static int rbd_get_num_segments(struct rbd_image_header *header,
669 u64 ofs, u64 len)
670{
671 u64 start_seg = ofs >> header->obj_order;
672 u64 end_seg = (ofs + len - 1) >> header->obj_order;
673 return end_seg - start_seg + 1;
674}
675
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700676/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700677 * returns the size of an object in the image
678 */
679static u64 rbd_obj_bytes(struct rbd_image_header *header)
680{
681 return 1 << header->obj_order;
682}
683
684/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700685 * bio helpers
686 */
687
688static void bio_chain_put(struct bio *chain)
689{
690 struct bio *tmp;
691
692 while (chain) {
693 tmp = chain;
694 chain = chain->bi_next;
695 bio_put(tmp);
696 }
697}
698
699/*
700 * zeros a bio chain, starting at specific offset
701 */
702static void zero_bio_chain(struct bio *chain, int start_ofs)
703{
704 struct bio_vec *bv;
705 unsigned long flags;
706 void *buf;
707 int i;
708 int pos = 0;
709
710 while (chain) {
711 bio_for_each_segment(bv, chain, i) {
712 if (pos + bv->bv_len > start_ofs) {
713 int remainder = max(start_ofs - pos, 0);
714 buf = bvec_kmap_irq(bv, &flags);
715 memset(buf + remainder, 0,
716 bv->bv_len - remainder);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +0200717 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700718 }
719 pos += bv->bv_len;
720 }
721
722 chain = chain->bi_next;
723 }
724}
725
726/*
727 * bio_chain_clone - clone a chain of bios up to a certain length.
728 * might return a bio_pair that will need to be released.
729 */
730static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
731 struct bio_pair **bp,
732 int len, gfp_t gfpmask)
733{
734 struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
735 int total = 0;
736
737 if (*bp) {
738 bio_pair_release(*bp);
739 *bp = NULL;
740 }
741
742 while (old_chain && (total < len)) {
743 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
744 if (!tmp)
745 goto err_out;
746
747 if (total + old_chain->bi_size > len) {
748 struct bio_pair *bp;
749
750 /*
751 * this split can only happen with a single paged bio,
752 * split_bio will BUG_ON if this is not the case
753 */
754 dout("bio_chain_clone split! total=%d remaining=%d"
Alex Elderbd919d42012-07-13 20:35:11 -0500755 "bi_size=%u\n",
756 total, len - total, old_chain->bi_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700757
758 /* split the bio. We'll release it either in the next
759 call, or it will have to be released outside */
Alex Elder593a9e72012-02-07 12:03:37 -0600760 bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700761 if (!bp)
762 goto err_out;
763
764 __bio_clone(tmp, &bp->bio1);
765
766 *next = &bp->bio2;
767 } else {
768 __bio_clone(tmp, old_chain);
769 *next = old_chain->bi_next;
770 }
771
772 tmp->bi_bdev = NULL;
773 gfpmask &= ~__GFP_WAIT;
774 tmp->bi_next = NULL;
775
776 if (!new_chain) {
777 new_chain = tail = tmp;
778 } else {
779 tail->bi_next = tmp;
780 tail = tmp;
781 }
782 old_chain = old_chain->bi_next;
783
784 total += tmp->bi_size;
785 }
786
787 BUG_ON(total < len);
788
789 if (tail)
790 tail->bi_next = NULL;
791
792 *old = old_chain;
793
794 return new_chain;
795
796err_out:
797 dout("bio_chain_clone with err\n");
798 bio_chain_put(new_chain);
799 return NULL;
800}
801
802/*
803 * helpers for osd request op vectors.
804 */
Alex Elder57cfc102012-06-26 12:57:03 -0700805static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
806 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700807{
Alex Elder57cfc102012-06-26 12:57:03 -0700808 struct ceph_osd_req_op *ops;
809
810 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
811 if (!ops)
812 return NULL;
813
814 ops[0].op = opcode;
815
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700816 /*
817 * op extent offset and length will be set later on
818 * in calc_raw_layout()
819 */
Alex Elder57cfc102012-06-26 12:57:03 -0700820 ops[0].payload_len = payload_len;
821
822 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700823}
824
/* Free an op vector obtained from rbd_create_rw_ops(). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
829
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700830static void rbd_coll_end_req_index(struct request *rq,
831 struct rbd_req_coll *coll,
832 int index,
833 int ret, u64 len)
834{
835 struct request_queue *q;
836 int min, max, i;
837
Alex Elderbd919d42012-07-13 20:35:11 -0500838 dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
839 coll, index, ret, (unsigned long long) len);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700840
841 if (!rq)
842 return;
843
844 if (!coll) {
845 blk_end_request(rq, ret, len);
846 return;
847 }
848
849 q = rq->q;
850
851 spin_lock_irq(q->queue_lock);
852 coll->status[index].done = 1;
853 coll->status[index].rc = ret;
854 coll->status[index].bytes = len;
855 max = min = coll->num_done;
856 while (max < coll->total && coll->status[max].done)
857 max++;
858
859 for (i = min; i<max; i++) {
860 __blk_end_request(rq, coll->status[i].rc,
861 coll->status[i].bytes);
862 coll->num_done++;
863 kref_put(&coll->kref, rbd_coll_release);
864 }
865 spin_unlock_irq(q->queue_lock);
866}
867
868static void rbd_coll_end_req(struct rbd_request *req,
869 int ret, u64 len)
870{
871 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
872}
873
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700874/*
875 * Send ceph osd request
876 */
877static int rbd_do_request(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -0500878 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700879 struct ceph_snap_context *snapc,
880 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -0500881 const char *object_name, u64 ofs, u64 len,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700882 struct bio *bio,
883 struct page **pages,
884 int num_pages,
885 int flags,
886 struct ceph_osd_req_op *ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700887 struct rbd_req_coll *coll,
888 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700889 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700890 struct ceph_msg *msg),
891 struct ceph_osd_request **linger_req,
892 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700893{
894 struct ceph_osd_request *req;
895 struct ceph_file_layout *layout;
896 int ret;
897 u64 bno;
898 struct timespec mtime = CURRENT_TIME;
899 struct rbd_request *req_data;
900 struct ceph_osd_request_head *reqhead;
Alex Elder1dbb4392012-01-24 10:08:37 -0600901 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700902
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700903 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700904 if (!req_data) {
905 if (coll)
906 rbd_coll_end_req_index(rq, coll, coll_index,
907 -ENOMEM, len);
908 return -ENOMEM;
909 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700910
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700911 if (coll) {
912 req_data->coll = coll;
913 req_data->coll_index = coll_index;
914 }
915
Alex Elderbd919d42012-07-13 20:35:11 -0500916 dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
917 (unsigned long long) ofs, (unsigned long long) len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700918
Alex Elder0ce1a792012-07-03 16:01:18 -0500919 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder1dbb4392012-01-24 10:08:37 -0600920 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
921 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -0700922 if (!req) {
Sage Weil4ad12622011-05-03 09:23:36 -0700923 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700924 goto done_pages;
925 }
926
927 req->r_callback = rbd_cb;
928
929 req_data->rq = rq;
930 req_data->bio = bio;
931 req_data->pages = pages;
932 req_data->len = len;
933
934 req->r_priv = req_data;
935
936 reqhead = req->r_request->front.iov_base;
937 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
938
Alex Elderaded07e2012-07-03 16:01:18 -0500939 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700940 req->r_oid_len = strlen(req->r_oid);
941
942 layout = &req->r_file_layout;
943 memset(layout, 0, sizeof(*layout));
944 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
945 layout->fl_stripe_count = cpu_to_le32(1);
946 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
Alex Elder0ce1a792012-07-03 16:01:18 -0500947 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
Alex Elder1dbb4392012-01-24 10:08:37 -0600948 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
949 req, ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700950
951 ceph_osdc_build_request(req, ofs, &len,
952 ops,
953 snapc,
954 &mtime,
955 req->r_oid, req->r_oid_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700956
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700957 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600958 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700959 *linger_req = req;
960 }
961
Alex Elder1dbb4392012-01-24 10:08:37 -0600962 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700963 if (ret < 0)
964 goto done_err;
965
966 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600967 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700968 if (ver)
969 *ver = le64_to_cpu(req->r_reassert_version.version);
Alex Elderbd919d42012-07-13 20:35:11 -0500970 dout("reassert_ver=%llu\n",
971 (unsigned long long)
972 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700973 ceph_osdc_put_request(req);
974 }
975 return ret;
976
977done_err:
978 bio_chain_put(req_data->bio);
979 ceph_osdc_put_request(req);
980done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700981 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700982 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700983 return ret;
984}
985
986/*
987 * Ceph osd op callback
988 */
989static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
990{
991 struct rbd_request *req_data = req->r_priv;
992 struct ceph_osd_reply_head *replyhead;
993 struct ceph_osd_op *op;
994 __s32 rc;
995 u64 bytes;
996 int read_op;
997
998 /* parse reply */
999 replyhead = msg->front.iov_base;
1000 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1001 op = (void *)(replyhead + 1);
1002 rc = le32_to_cpu(replyhead->result);
1003 bytes = le64_to_cpu(op->extent.length);
Dan Carpenter895cfcc2012-06-06 09:15:33 -05001004 read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001005
Alex Elderbd919d42012-07-13 20:35:11 -05001006 dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1007 (unsigned long long) bytes, read_op, (int) rc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001008
1009 if (rc == -ENOENT && read_op) {
1010 zero_bio_chain(req_data->bio, 0);
1011 rc = 0;
1012 } else if (rc == 0 && read_op && bytes < req_data->len) {
1013 zero_bio_chain(req_data->bio, bytes);
1014 bytes = req_data->len;
1015 }
1016
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001017 rbd_coll_end_req(req_data, rc, bytes);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001018
1019 if (req_data->bio)
1020 bio_chain_put(req_data->bio);
1021
1022 ceph_osdc_put_request(req);
1023 kfree(req_data);
1024}
1025
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001026static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1027{
1028 ceph_osdc_put_request(req);
1029}
1030
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001031/*
1032 * Do a synchronous ceph osd operation
1033 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001034static int rbd_req_sync_op(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001035 struct ceph_snap_context *snapc,
1036 u64 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001037 int flags,
Alex Elder913d2fd2012-06-26 12:57:03 -07001038 struct ceph_osd_req_op *ops,
Alex Elderaded07e2012-07-03 16:01:18 -05001039 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001040 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001041 char *buf,
1042 struct ceph_osd_request **linger_req,
1043 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001044{
1045 int ret;
1046 struct page **pages;
1047 int num_pages;
Alex Elder913d2fd2012-06-26 12:57:03 -07001048
1049 BUG_ON(ops == NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001050
1051 num_pages = calc_pages_for(ofs , len);
1052 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
Dan Carpenterb8d06382010-10-11 21:14:23 +02001053 if (IS_ERR(pages))
1054 return PTR_ERR(pages);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001055
Alex Elder0ce1a792012-07-03 16:01:18 -05001056 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001057 object_name, ofs, len, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001058 pages, num_pages,
1059 flags,
1060 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001061 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001062 NULL,
1063 linger_req, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001064 if (ret < 0)
Alex Elder913d2fd2012-06-26 12:57:03 -07001065 goto done;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001066
1067 if ((flags & CEPH_OSD_FLAG_READ) && buf)
1068 ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
1069
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001070done:
1071 ceph_release_page_vector(pages, num_pages);
1072 return ret;
1073}
1074
1075/*
1076 * Do an asynchronous ceph osd operation
1077 */
1078static int rbd_do_op(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -05001079 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001080 struct ceph_snap_context *snapc,
1081 u64 snapid,
Alex Elderd1f57ea2012-06-26 12:57:03 -07001082 int opcode, int flags,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001083 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001084 struct bio *bio,
1085 struct rbd_req_coll *coll,
1086 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001087{
1088 char *seg_name;
1089 u64 seg_ofs;
1090 u64 seg_len;
1091 int ret;
1092 struct ceph_osd_req_op *ops;
1093 u32 payload_len;
1094
1095 seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
1096 if (!seg_name)
1097 return -ENOMEM;
1098
1099 seg_len = rbd_get_segment(&rbd_dev->header,
Alex Elderca1e49a2012-07-10 20:30:09 -05001100 rbd_dev->header.object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001101 ofs, len,
1102 seg_name, &seg_ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001103
1104 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1105
Alex Elder57cfc102012-06-26 12:57:03 -07001106 ret = -ENOMEM;
1107 ops = rbd_create_rw_ops(1, opcode, payload_len);
1108 if (!ops)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001109 goto done;
1110
1111 /* we've taken care of segment sizes earlier when we
1112 cloned the bios. We should never have a segment
1113 truncated at this point */
1114 BUG_ON(seg_len < len);
1115
1116 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1117 seg_name, seg_ofs, seg_len,
1118 bio,
1119 NULL, 0,
1120 flags,
1121 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001122 coll, coll_index,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001123 rbd_req_cb, 0, NULL);
Sage Weil11f77002011-05-12 16:13:54 -07001124
1125 rbd_destroy_ops(ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001126done:
1127 kfree(seg_name);
1128 return ret;
1129}
1130
1131/*
1132 * Request async osd write
1133 */
1134static int rbd_req_write(struct request *rq,
1135 struct rbd_device *rbd_dev,
1136 struct ceph_snap_context *snapc,
1137 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001138 struct bio *bio,
1139 struct rbd_req_coll *coll,
1140 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001141{
1142 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1143 CEPH_OSD_OP_WRITE,
1144 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001145 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001146}
1147
1148/*
1149 * Request async osd read
1150 */
1151static int rbd_req_read(struct request *rq,
1152 struct rbd_device *rbd_dev,
1153 u64 snapid,
1154 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001155 struct bio *bio,
1156 struct rbd_req_coll *coll,
1157 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001158{
1159 return rbd_do_op(rq, rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001160 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001161 CEPH_OSD_OP_READ,
1162 CEPH_OSD_FLAG_READ,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001163 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001164}
1165
1166/*
1167 * Request sync osd read
1168 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001169static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001170 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001171 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001172 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001173 char *buf,
1174 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001175{
Alex Elder913d2fd2012-06-26 12:57:03 -07001176 struct ceph_osd_req_op *ops;
1177 int ret;
1178
1179 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1180 if (!ops)
1181 return -ENOMEM;
1182
1183 ret = rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001184 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001185 CEPH_OSD_FLAG_READ,
Alex Elder913d2fd2012-06-26 12:57:03 -07001186 ops, object_name, ofs, len, buf, NULL, ver);
1187 rbd_destroy_ops(ops);
1188
1189 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001190}
1191
1192/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001193 * Request sync osd watch
1194 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001195static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001196 u64 ver,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001197 u64 notify_id)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001198{
1199 struct ceph_osd_req_op *ops;
Sage Weil11f77002011-05-12 16:13:54 -07001200 int ret;
1201
Alex Elder57cfc102012-06-26 12:57:03 -07001202 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1203 if (!ops)
1204 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001205
Josh Durgina71b8912011-12-05 18:10:44 -08001206 ops[0].watch.ver = cpu_to_le64(ver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001207 ops[0].watch.cookie = notify_id;
1208 ops[0].watch.flag = 0;
1209
Alex Elder0ce1a792012-07-03 16:01:18 -05001210 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001211 rbd_dev->header_name, 0, 0, NULL,
Alex Elderad4f2322012-07-03 16:01:19 -05001212 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001213 CEPH_OSD_FLAG_READ,
1214 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001215 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001216 rbd_simple_req_cb, 0, NULL);
1217
1218 rbd_destroy_ops(ops);
1219 return ret;
1220}
1221
1222static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1223{
Alex Elder0ce1a792012-07-03 16:01:18 -05001224 struct rbd_device *rbd_dev = (struct rbd_device *)data;
Josh Durgina71b8912011-12-05 18:10:44 -08001225 u64 hver;
Sage Weil13143d22011-05-12 16:08:30 -07001226 int rc;
1227
Alex Elder0ce1a792012-07-03 16:01:18 -05001228 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001229 return;
1230
Alex Elderbd919d42012-07-13 20:35:11 -05001231 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1232 rbd_dev->header_name, (unsigned long long) notify_id,
1233 (unsigned int) opcode);
Alex Elder1fe5e992012-07-25 09:32:41 -05001234 rc = rbd_refresh_header(rbd_dev, &hver);
Sage Weil13143d22011-05-12 16:08:30 -07001235 if (rc)
Alex Elderf0f8cef2012-01-29 13:57:44 -06001236 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
Alex Elder0ce1a792012-07-03 16:01:18 -05001237 " update snaps: %d\n", rbd_dev->major, rc);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001238
Alex Elder7f0a24d2012-07-25 09:32:40 -05001239 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001240}
1241
1242/*
1243 * Request sync osd watch
1244 */
Alex Elder0e6f3222012-07-25 09:32:40 -05001245static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001246{
1247 struct ceph_osd_req_op *ops;
Alex Elder0ce1a792012-07-03 16:01:18 -05001248 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder57cfc102012-06-26 12:57:03 -07001249 int ret;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001250
Alex Elder57cfc102012-06-26 12:57:03 -07001251 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1252 if (!ops)
1253 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001254
1255 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
Alex Elder0ce1a792012-07-03 16:01:18 -05001256 (void *)rbd_dev, &rbd_dev->watch_event);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001257 if (ret < 0)
1258 goto fail;
1259
Alex Elder0e6f3222012-07-25 09:32:40 -05001260 ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
Alex Elder0ce1a792012-07-03 16:01:18 -05001261 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001262 ops[0].watch.flag = 1;
1263
Alex Elder0ce1a792012-07-03 16:01:18 -05001264 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001265 CEPH_NOSNAP,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001266 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1267 ops,
Alex Elder0e6f3222012-07-25 09:32:40 -05001268 rbd_dev->header_name,
1269 0, 0, NULL,
Alex Elder0ce1a792012-07-03 16:01:18 -05001270 &rbd_dev->watch_request, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001271
1272 if (ret < 0)
1273 goto fail_event;
1274
1275 rbd_destroy_ops(ops);
1276 return 0;
1277
1278fail_event:
Alex Elder0ce1a792012-07-03 16:01:18 -05001279 ceph_osdc_cancel_event(rbd_dev->watch_event);
1280 rbd_dev->watch_event = NULL;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001281fail:
1282 rbd_destroy_ops(ops);
1283 return ret;
1284}
1285
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001286/*
1287 * Request sync osd unwatch
1288 */
Alex Elder070c6332012-07-25 09:32:41 -05001289static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001290{
1291 struct ceph_osd_req_op *ops;
Alex Elder57cfc102012-06-26 12:57:03 -07001292 int ret;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001293
Alex Elder57cfc102012-06-26 12:57:03 -07001294 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1295 if (!ops)
1296 return -ENOMEM;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001297
1298 ops[0].watch.ver = 0;
Alex Elder0ce1a792012-07-03 16:01:18 -05001299 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001300 ops[0].watch.flag = 0;
1301
Alex Elder0ce1a792012-07-03 16:01:18 -05001302 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001303 CEPH_NOSNAP,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001304 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1305 ops,
Alex Elder070c6332012-07-25 09:32:41 -05001306 rbd_dev->header_name,
1307 0, 0, NULL, NULL, NULL);
1308
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001309
1310 rbd_destroy_ops(ops);
Alex Elder0ce1a792012-07-03 16:01:18 -05001311 ceph_osdc_cancel_event(rbd_dev->watch_event);
1312 rbd_dev->watch_event = NULL;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001313 return ret;
1314}
1315
/* Wrapper holding the rbd device associated with a notify request. */
struct rbd_notify_info {
	struct rbd_device *rbd_dev;
};
1319
1320static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1321{
Alex Elder0ce1a792012-07-03 16:01:18 -05001322 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1323 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001324 return;
1325
Alex Elderbd919d42012-07-13 20:35:11 -05001326 dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n",
1327 rbd_dev->header_name, (unsigned long long) notify_id,
1328 (unsigned int) opcode);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001329}
1330
1331/*
1332 * Request sync osd notify
1333 */
Alex Elder4cb16252012-07-25 09:32:40 -05001334static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001335{
1336 struct ceph_osd_req_op *ops;
Alex Elder0ce1a792012-07-03 16:01:18 -05001337 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001338 struct ceph_osd_event *event;
1339 struct rbd_notify_info info;
1340 int payload_len = sizeof(u32) + sizeof(u32);
1341 int ret;
1342
Alex Elder57cfc102012-06-26 12:57:03 -07001343 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len);
1344 if (!ops)
1345 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001346
Alex Elder0ce1a792012-07-03 16:01:18 -05001347 info.rbd_dev = rbd_dev;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001348
1349 ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
1350 (void *)&info, &event);
1351 if (ret < 0)
1352 goto fail;
1353
1354 ops[0].watch.ver = 1;
1355 ops[0].watch.flag = 1;
1356 ops[0].watch.cookie = event->cookie;
1357 ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
1358 ops[0].watch.timeout = 12;
1359
Alex Elder0ce1a792012-07-03 16:01:18 -05001360 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001361 CEPH_NOSNAP,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001362 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1363 ops,
Alex Elder4cb16252012-07-25 09:32:40 -05001364 rbd_dev->header_name,
1365 0, 0, NULL, NULL, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001366 if (ret < 0)
1367 goto fail_event;
1368
1369 ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
1370 dout("ceph_osdc_wait_event returned %d\n", ret);
1371 rbd_destroy_ops(ops);
1372 return 0;
1373
1374fail_event:
1375 ceph_osdc_cancel_event(event);
1376fail:
1377 rbd_destroy_ops(ops);
1378 return ret;
1379}
1380
1381/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001382 * Request sync osd read
1383 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001384static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001385 const char *object_name,
1386 const char *class_name,
1387 const char *method_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001388 const char *data,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001389 int len,
1390 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001391{
1392 struct ceph_osd_req_op *ops;
Alex Elderaded07e2012-07-03 16:01:18 -05001393 int class_name_len = strlen(class_name);
1394 int method_name_len = strlen(method_name);
Alex Elder57cfc102012-06-26 12:57:03 -07001395 int ret;
1396
1397 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL,
Alex Elderaded07e2012-07-03 16:01:18 -05001398 class_name_len + method_name_len + len);
Alex Elder57cfc102012-06-26 12:57:03 -07001399 if (!ops)
1400 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001401
Alex Elderaded07e2012-07-03 16:01:18 -05001402 ops[0].cls.class_name = class_name;
1403 ops[0].cls.class_len = (__u8) class_name_len;
1404 ops[0].cls.method_name = method_name;
1405 ops[0].cls.method_len = (__u8) method_name_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001406 ops[0].cls.argc = 0;
1407 ops[0].cls.indata = data;
1408 ops[0].cls.indata_len = len;
1409
Alex Elder0ce1a792012-07-03 16:01:18 -05001410 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001411 CEPH_NOSNAP,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001412 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1413 ops,
Alex Elderd1f57ea2012-06-26 12:57:03 -07001414 object_name, 0, 0, NULL, NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001415
1416 rbd_destroy_ops(ops);
1417
1418 dout("cls_exec returned %d\n", ret);
1419 return ret;
1420}
1421
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001422static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1423{
1424 struct rbd_req_coll *coll =
1425 kzalloc(sizeof(struct rbd_req_coll) +
1426 sizeof(struct rbd_req_status) * num_reqs,
1427 GFP_ATOMIC);
1428
1429 if (!coll)
1430 return NULL;
1431 coll->total = num_reqs;
1432 kref_init(&coll->kref);
1433 return coll;
1434}
1435
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001436/*
1437 * block device queue callback
1438 */
1439static void rbd_rq_fn(struct request_queue *q)
1440{
1441 struct rbd_device *rbd_dev = q->queuedata;
1442 struct request *rq;
1443 struct bio_pair *bp = NULL;
1444
Alex Elder00f1f362012-02-07 12:03:36 -06001445 while ((rq = blk_fetch_request(q))) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001446 struct bio *bio;
1447 struct bio *rq_bio, *next_bio = NULL;
1448 bool do_write;
Alex Elderbd919d42012-07-13 20:35:11 -05001449 unsigned int size;
1450 u64 op_size = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001451 u64 ofs;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001452 int num_segs, cur_seg = 0;
1453 struct rbd_req_coll *coll;
Josh Durgind1d25642011-12-05 14:03:05 -08001454 struct ceph_snap_context *snapc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001455
1456 /* peek at request from block layer */
1457 if (!rq)
1458 break;
1459
1460 dout("fetched request\n");
1461
1462 /* filter out block requests we don't understand */
1463 if ((rq->cmd_type != REQ_TYPE_FS)) {
1464 __blk_end_request_all(rq, 0);
Alex Elder00f1f362012-02-07 12:03:36 -06001465 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001466 }
1467
1468 /* deduce our operation (read, write) */
1469 do_write = (rq_data_dir(rq) == WRITE);
1470
1471 size = blk_rq_bytes(rq);
Alex Elder593a9e72012-02-07 12:03:37 -06001472 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001473 rq_bio = rq->bio;
1474 if (do_write && rbd_dev->read_only) {
1475 __blk_end_request_all(rq, -EROFS);
Alex Elder00f1f362012-02-07 12:03:36 -06001476 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001477 }
1478
1479 spin_unlock_irq(q->queue_lock);
1480
Josh Durgind1d25642011-12-05 14:03:05 -08001481 down_read(&rbd_dev->header_rwsem);
Josh Durgine88a36e2011-11-21 18:14:25 -08001482
Josh Durgind1d25642011-12-05 14:03:05 -08001483 if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) {
Josh Durgine88a36e2011-11-21 18:14:25 -08001484 up_read(&rbd_dev->header_rwsem);
Josh Durgind1d25642011-12-05 14:03:05 -08001485 dout("request for non-existent snapshot");
1486 spin_lock_irq(q->queue_lock);
1487 __blk_end_request_all(rq, -ENXIO);
1488 continue;
Josh Durgine88a36e2011-11-21 18:14:25 -08001489 }
1490
Josh Durgind1d25642011-12-05 14:03:05 -08001491 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1492
1493 up_read(&rbd_dev->header_rwsem);
1494
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001495 dout("%s 0x%x bytes at 0x%llx\n",
1496 do_write ? "write" : "read",
Alex Elderbd919d42012-07-13 20:35:11 -05001497 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001498
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001499 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
1500 coll = rbd_alloc_coll(num_segs);
1501 if (!coll) {
1502 spin_lock_irq(q->queue_lock);
1503 __blk_end_request_all(rq, -ENOMEM);
Josh Durgind1d25642011-12-05 14:03:05 -08001504 ceph_put_snap_context(snapc);
Alex Elder00f1f362012-02-07 12:03:36 -06001505 continue;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001506 }
1507
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001508 do {
1509 /* a bio clone to be passed down to OSD req */
Alex Elderbd919d42012-07-13 20:35:11 -05001510 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001511 op_size = rbd_get_segment(&rbd_dev->header,
Alex Elderca1e49a2012-07-10 20:30:09 -05001512 rbd_dev->header.object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001513 ofs, size,
1514 NULL, NULL);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001515 kref_get(&coll->kref);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001516 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1517 op_size, GFP_ATOMIC);
1518 if (!bio) {
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001519 rbd_coll_end_req_index(rq, coll, cur_seg,
1520 -ENOMEM, op_size);
1521 goto next_seg;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001522 }
1523
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001524
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001525 /* init OSD command: write or read */
1526 if (do_write)
1527 rbd_req_write(rq, rbd_dev,
Josh Durgind1d25642011-12-05 14:03:05 -08001528 snapc,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001529 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001530 op_size, bio,
1531 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001532 else
1533 rbd_req_read(rq, rbd_dev,
Josh Durgin77dfe992011-11-21 13:04:42 -08001534 rbd_dev->snap_id,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001535 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001536 op_size, bio,
1537 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001538
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001539next_seg:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001540 size -= op_size;
1541 ofs += op_size;
1542
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001543 cur_seg++;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001544 rq_bio = next_bio;
1545 } while (size > 0);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001546 kref_put(&coll->kref, rbd_coll_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001547
1548 if (bp)
1549 bio_pair_release(bp);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001550 spin_lock_irq(q->queue_lock);
Josh Durgind1d25642011-12-05 14:03:05 -08001551
1552 ceph_put_snap_context(snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001553 }
1554}
1555
1556/*
1557 * a queue callback. Makes sure that we don't create a bio that spans across
1558 * multiple osd objects. One exception would be with a single page bios,
1559 * which we handle later at bio_chain_clone
1560 */
1561static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1562 struct bio_vec *bvec)
1563{
1564 struct rbd_device *rbd_dev = q->queuedata;
Alex Elder593a9e72012-02-07 12:03:37 -06001565 unsigned int chunk_sectors;
1566 sector_t sector;
1567 unsigned int bio_sectors;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001568 int max;
1569
Alex Elder593a9e72012-02-07 12:03:37 -06001570 chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1571 sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1572 bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1573
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001574 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
Alex Elder593a9e72012-02-07 12:03:37 -06001575 + bio_sectors)) << SECTOR_SHIFT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001576 if (max < 0)
1577 max = 0; /* bio_add cannot handle a negative return */
1578 if (max <= bvec->bv_len && bio_sectors == 0)
1579 return bvec->bv_len;
1580 return max;
1581}
1582
1583static void rbd_free_disk(struct rbd_device *rbd_dev)
1584{
1585 struct gendisk *disk = rbd_dev->disk;
1586
1587 if (!disk)
1588 return;
1589
1590 rbd_header_free(&rbd_dev->header);
1591
1592 if (disk->flags & GENHD_FL_UP)
1593 del_gendisk(disk);
1594 if (disk->queue)
1595 blk_cleanup_queue(disk->queue);
1596 put_disk(disk);
1597}
1598
1599/*
1600 * reload the ondisk the header
1601 */
1602static int rbd_read_header(struct rbd_device *rbd_dev,
1603 struct rbd_image_header *header)
1604{
1605 ssize_t rc;
1606 struct rbd_image_header_ondisk *dh;
Xi Wang50f7c4c2012-04-20 15:49:44 -05001607 u32 snap_count = 0;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001608 u64 ver;
Alex Elder00f1f362012-02-07 12:03:36 -06001609 size_t len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001610
Alex Elder00f1f362012-02-07 12:03:36 -06001611 /*
1612 * First reads the fixed-size header to determine the number
1613 * of snapshots, then re-reads it, along with all snapshot
1614 * records as well as their stored names.
1615 */
1616 len = sizeof (*dh);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001617 while (1) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001618 dh = kmalloc(len, GFP_KERNEL);
1619 if (!dh)
1620 return -ENOMEM;
1621
1622 rc = rbd_req_sync_read(rbd_dev,
Alex Elder9a5d6902012-07-19 09:09:27 -05001623 CEPH_NOSNAP,
Alex Elder0bed54d2012-07-03 16:01:18 -05001624 rbd_dev->header_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001625 0, len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001626 (char *)dh, &ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001627 if (rc < 0)
1628 goto out_dh;
1629
Alex Eldered63f4f2012-07-19 09:09:27 -05001630 rc = rbd_header_from_disk(header, dh, snap_count);
Josh Durgin81e759f2011-11-15 14:49:53 -08001631 if (rc < 0) {
Alex Elder00f1f362012-02-07 12:03:36 -06001632 if (rc == -ENXIO)
Josh Durgin81e759f2011-11-15 14:49:53 -08001633 pr_warning("unrecognized header format"
Alex Elder0bed54d2012-07-03 16:01:18 -05001634 " for image %s\n",
1635 rbd_dev->image_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001636 goto out_dh;
Josh Durgin81e759f2011-11-15 14:49:53 -08001637 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001638
Alex Elder00f1f362012-02-07 12:03:36 -06001639 if (snap_count == header->total_snaps)
1640 break;
1641
1642 snap_count = header->total_snaps;
1643 len = sizeof (*dh) +
1644 snap_count * sizeof(struct rbd_image_snap_ondisk) +
1645 header->snap_names_len;
1646
1647 rbd_header_free(header);
1648 kfree(dh);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001649 }
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001650 header->obj_version = ver;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001651
1652out_dh:
1653 kfree(dh);
1654 return rc;
1655}
1656
1657/*
1658 * create a snapshot
1659 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001660static int rbd_header_add_snap(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001661 const char *snap_name,
1662 gfp_t gfp_flags)
1663{
1664 int name_len = strlen(snap_name);
1665 u64 new_snapid;
1666 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001667 void *data, *p, *e;
Alex Elder1dbb4392012-01-24 10:08:37 -06001668 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001669
1670 /* we should create a snapshot only if we're pointing at the head */
Alex Elder0ce1a792012-07-03 16:01:18 -05001671 if (rbd_dev->snap_id != CEPH_NOSNAP)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001672 return -EINVAL;
1673
Alex Elder0ce1a792012-07-03 16:01:18 -05001674 monc = &rbd_dev->rbd_client->client->monc;
1675 ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
Alex Elderbd919d42012-07-13 20:35:11 -05001676 dout("created snapid=%llu\n", (unsigned long long) new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001677 if (ret < 0)
1678 return ret;
1679
1680 data = kmalloc(name_len + 16, gfp_flags);
1681 if (!data)
1682 return -ENOMEM;
1683
Sage Weil916d4d62011-05-12 16:10:50 -07001684 p = data;
1685 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001686
Sage Weil916d4d62011-05-12 16:10:50 -07001687 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1688 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001689
Alex Elder0bed54d2012-07-03 16:01:18 -05001690 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
Alex Elder0ce1a792012-07-03 16:01:18 -05001691 "rbd", "snap_add",
Alex Elderd67d4be2012-07-13 20:35:11 -05001692 data, p - data, NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001693
Sage Weil916d4d62011-05-12 16:10:50 -07001694 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001695
Alex Elder505cbb92012-07-19 08:49:18 -05001696 return ret < 0 ? ret : 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001697bad:
1698 return -ERANGE;
1699}
1700
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001701static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1702{
1703 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001704 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001705
Alex Eldera0593292012-07-19 09:09:27 -05001706 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder14e70852012-07-19 09:09:27 -05001707 __rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001708}
1709
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001710/*
1711 * only read the first part of the ondisk header, without the snaps info
1712 */
Alex Elderb8136232012-07-25 09:32:41 -05001713static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001714{
1715 int ret;
1716 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001717
1718 ret = rbd_read_header(rbd_dev, &h);
1719 if (ret < 0)
1720 return ret;
1721
Josh Durgina51aa0c2011-12-05 10:35:04 -08001722 down_write(&rbd_dev->header_rwsem);
1723
Sage Weil9db4b3e2011-04-19 22:49:06 -07001724 /* resized? */
Josh Durgin474ef7c2011-11-21 17:13:54 -08001725 if (rbd_dev->snap_id == CEPH_NOSNAP) {
1726 sector_t size = (sector_t) h.image_size / SECTOR_SIZE;
1727
1728 dout("setting size to %llu sectors", (unsigned long long) size);
1729 set_capacity(rbd_dev->disk, size);
1730 }
Sage Weil9db4b3e2011-04-19 22:49:06 -07001731
Alex Elder849b4262012-07-09 21:04:24 -05001732 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001733 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05001734 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08001735 /* osd requests may still refer to snapc */
1736 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001737
Alex Elderb8136232012-07-25 09:32:41 -05001738 if (hver)
1739 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08001740 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08001741 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001742 rbd_dev->header.total_snaps = h.total_snaps;
1743 rbd_dev->header.snapc = h.snapc;
1744 rbd_dev->header.snap_names = h.snap_names;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001745 rbd_dev->header.snap_names_len = h.snap_names_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001746 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05001747 /* Free the extra copy of the object prefix */
1748 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1749 kfree(h.object_prefix);
1750
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001751 ret = __rbd_init_snaps_header(rbd_dev);
1752
Josh Durginc6666012011-11-21 17:11:12 -08001753 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001754
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001755 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001756}
1757
Alex Elder1fe5e992012-07-25 09:32:41 -05001758static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
1759{
1760 int ret;
1761
1762 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1763 ret = __rbd_refresh_header(rbd_dev, hver);
1764 mutex_unlock(&ctl_mutex);
1765
1766 return ret;
1767}
1768
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001769static int rbd_init_disk(struct rbd_device *rbd_dev)
1770{
1771 struct gendisk *disk;
1772 struct request_queue *q;
1773 int rc;
Alex Elder593a9e72012-02-07 12:03:37 -06001774 u64 segment_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001775 u64 total_size = 0;
1776
1777 /* contact OSD, request size info about the object being mapped */
1778 rc = rbd_read_header(rbd_dev, &rbd_dev->header);
1779 if (rc)
1780 return rc;
1781
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001782 /* no need to lock here, as rbd_dev is not registered yet */
1783 rc = __rbd_init_snaps_header(rbd_dev);
1784 if (rc)
1785 return rc;
1786
Josh Durgincc9d7342011-11-21 18:19:13 -08001787 rc = rbd_header_set_snap(rbd_dev, &total_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001788 if (rc)
1789 return rc;
1790
1791 /* create gendisk info */
1792 rc = -ENOMEM;
1793 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1794 if (!disk)
1795 goto out;
1796
Alex Elderf0f8cef2012-01-29 13:57:44 -06001797 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05001798 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001799 disk->major = rbd_dev->major;
1800 disk->first_minor = 0;
1801 disk->fops = &rbd_bd_ops;
1802 disk->private_data = rbd_dev;
1803
1804 /* init rq */
1805 rc = -ENOMEM;
1806 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1807 if (!q)
1808 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07001809
Alex Elder593a9e72012-02-07 12:03:37 -06001810 /* We use the default size, but let's be explicit about it. */
1811 blk_queue_physical_block_size(q, SECTOR_SIZE);
1812
Josh Durgin029bcbd2011-07-22 11:35:23 -07001813 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06001814 segment_size = rbd_obj_bytes(&rbd_dev->header);
1815 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1816 blk_queue_max_segment_size(q, segment_size);
1817 blk_queue_io_min(q, segment_size);
1818 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07001819
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001820 blk_queue_merge_bvec(q, rbd_merge_bvec);
1821 disk->queue = q;
1822
1823 q->queuedata = rbd_dev;
1824
1825 rbd_dev->disk = disk;
1826 rbd_dev->q = q;
1827
1828 /* finally, announce the disk to the world */
Alex Elder593a9e72012-02-07 12:03:37 -06001829 set_capacity(disk, total_size / SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001830 add_disk(disk);
1831
1832 pr_info("%s: added with size 0x%llx\n",
1833 disk->disk_name, (unsigned long long)total_size);
1834 return 0;
1835
1836out_disk:
1837 put_disk(disk);
1838out:
1839 return rc;
1840}
1841
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001842/*
1843 sysfs
1844*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001845
/*
 * Map a struct device embedded in an rbd_device (its "dev" member)
 * back to the rbd_device that contains it.
 */
Alex Elder593a9e72012-02-07 12:03:37 -06001846static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1847{
1848 return container_of(dev, struct rbd_device, dev);
1849}
1850
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001851static ssize_t rbd_size_show(struct device *dev,
1852 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001853{
Alex Elder593a9e72012-02-07 12:03:37 -06001854 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001855 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001856
Josh Durgina51aa0c2011-12-05 10:35:04 -08001857 down_read(&rbd_dev->header_rwsem);
1858 size = get_capacity(rbd_dev->disk);
1859 up_read(&rbd_dev->header_rwsem);
1860
1861 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001862}
1863
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001864static ssize_t rbd_major_show(struct device *dev,
1865 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001866{
Alex Elder593a9e72012-02-07 12:03:37 -06001867 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001868
1869 return sprintf(buf, "%d\n", rbd_dev->major);
1870}
1871
1872static ssize_t rbd_client_id_show(struct device *dev,
1873 struct device_attribute *attr, char *buf)
1874{
Alex Elder593a9e72012-02-07 12:03:37 -06001875 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001876
Alex Elder1dbb4392012-01-24 10:08:37 -06001877 return sprintf(buf, "client%lld\n",
1878 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001879}
1880
1881static ssize_t rbd_pool_show(struct device *dev,
1882 struct device_attribute *attr, char *buf)
1883{
Alex Elder593a9e72012-02-07 12:03:37 -06001884 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001885
1886 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1887}
1888
Alex Elder9bb2f332012-07-12 10:46:35 -05001889static ssize_t rbd_pool_id_show(struct device *dev,
1890 struct device_attribute *attr, char *buf)
1891{
1892 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1893
1894 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1895}
1896
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001897static ssize_t rbd_name_show(struct device *dev,
1898 struct device_attribute *attr, char *buf)
1899{
Alex Elder593a9e72012-02-07 12:03:37 -06001900 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001901
Alex Elder0bed54d2012-07-03 16:01:18 -05001902 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001903}
1904
1905static ssize_t rbd_snap_show(struct device *dev,
1906 struct device_attribute *attr,
1907 char *buf)
1908{
Alex Elder593a9e72012-02-07 12:03:37 -06001909 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001910
1911 return sprintf(buf, "%s\n", rbd_dev->snap_name);
1912}
1913
1914static ssize_t rbd_image_refresh(struct device *dev,
1915 struct device_attribute *attr,
1916 const char *buf,
1917 size_t size)
1918{
Alex Elder593a9e72012-02-07 12:03:37 -06001919 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05001920 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001921
Alex Elder1fe5e992012-07-25 09:32:41 -05001922 ret = rbd_refresh_header(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05001923
1924 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001925}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001926
/*
 * sysfs attributes of a mapped rbd device.  Those declared with
 * S_IRUGO only have a show method; "refresh" and "create_snap" are
 * declared with S_IWUSR and only have a store method.
 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001927static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
1928static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
1929static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
1930static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
Alex Elder9bb2f332012-07-12 10:46:35 -05001931static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001932static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
1933static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
1934static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
1935static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001936
/* NULL-terminated list of the rbd device attributes; used by rbd_attr_group. */
1937static struct attribute *rbd_attrs[] = {
1938 &dev_attr_size.attr,
1939 &dev_attr_major.attr,
1940 &dev_attr_client_id.attr,
1941 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05001942 &dev_attr_pool_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001943 &dev_attr_name.attr,
1944 &dev_attr_current_snap.attr,
1945 &dev_attr_refresh.attr,
1946 &dev_attr_create_snap.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001947 NULL
1948};
1949
/* Unnamed attribute group holding all entries of rbd_attrs. */
1950static struct attribute_group rbd_attr_group = {
1951 .attrs = rbd_attrs,
1952};
1953
/* NULL-terminated group list installed as rbd_device_type.groups. */
1954static const struct attribute_group *rbd_attr_groups[] = {
1955 &rbd_attr_group,
1956 NULL
1957};
1958
/*
 * Release callback of rbd_device_type; it does nothing.
 * NOTE(review): rbd_bus_add_dev() also sets dev->release to
 * rbd_dev_release, which presumably performs the real teardown --
 * confirm before relying on this callback.
 */
1959static void rbd_sysfs_dev_release(struct device *dev)
1960{
1961}
1962
/*
 * Device type of a mapped rbd device (assigned in rbd_bus_add_dev());
 * supplies the sysfs attribute groups for the device.
 */
1963static struct device_type rbd_device_type = {
1964 .name = "rbd",
1965 .groups = rbd_attr_groups,
1966 .release = rbd_sysfs_dev_release,
1967};
1968
1969
1970/*
1971 sysfs - snapshots
1972*/
1973
1974static ssize_t rbd_snap_size_show(struct device *dev,
1975 struct device_attribute *attr,
1976 char *buf)
1977{
1978 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1979
Josh Durgin35915382011-12-05 18:25:13 -08001980 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001981}
1982
1983static ssize_t rbd_snap_id_show(struct device *dev,
1984 struct device_attribute *attr,
1985 char *buf)
1986{
1987 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1988
Josh Durgin35915382011-12-05 18:25:13 -08001989 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001990}
1991
/* Read-only sysfs attributes of a snapshot device. */
1992static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
1993static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
1994
/* NULL-terminated list of the snapshot attributes; used by rbd_snap_attr_group. */
1995static struct attribute *rbd_snap_attrs[] = {
1996 &dev_attr_snap_size.attr,
1997 &dev_attr_snap_id.attr,
1998 NULL,
1999};
2000
/* Unnamed attribute group holding all entries of rbd_snap_attrs. */
2001static struct attribute_group rbd_snap_attr_group = {
2002 .attrs = rbd_snap_attrs,
2003};
2004
2005static void rbd_snap_dev_release(struct device *dev)
2006{
2007 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2008 kfree(snap->name);
2009 kfree(snap);
2010}
2011
/* NULL-terminated group list installed as rbd_snap_device_type.groups. */
2012static const struct attribute_group *rbd_snap_attr_groups[] = {
2013 &rbd_snap_attr_group,
2014 NULL
2015};
2016
/*
 * Device type of a snapshot device (assigned in
 * rbd_register_snap_dev()); supplies its sysfs attribute groups and
 * the release callback that frees the rbd_snap.
 */
2017static struct device_type rbd_snap_device_type = {
2018 .groups = rbd_snap_attr_groups,
2019 .release = rbd_snap_dev_release,
2020};
2021
/*
 * Unlink @snap from the snapshot list it is on and unregister its
 * embedded struct device.
 * NOTE(review): the rbd_snap is freed by rbd_snap_dev_release(),
 * presumably once the last device reference is dropped, so callers
 * should not use @snap after this returns -- confirm.
 */
Alex Elder14e70852012-07-19 09:09:27 -05002022static void __rbd_remove_snap_dev(struct rbd_snap *snap)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002023{
2024 list_del(&snap->node);
2025 device_unregister(&snap->dev);
2026}
2027
Alex Elder14e70852012-07-19 09:09:27 -05002028static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002029 struct device *parent)
2030{
2031 struct device *dev = &snap->dev;
2032 int ret;
2033
2034 dev->type = &rbd_snap_device_type;
2035 dev->parent = parent;
2036 dev->release = rbd_snap_dev_release;
2037 dev_set_name(dev, "snap_%s", snap->name);
2038 ret = device_register(dev);
2039
2040 return ret;
2041}
2042
Alex Elder4e891e02012-07-10 20:30:10 -05002043static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2044 int i, const char *name)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002045{
Alex Elder4e891e02012-07-10 20:30:10 -05002046 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002047 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002048
2049 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002050 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002051 return ERR_PTR(-ENOMEM);
2052
2053 ret = -ENOMEM;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002054 snap->name = kstrdup(name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002055 if (!snap->name)
2056 goto err;
2057
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002058 snap->size = rbd_dev->header.snap_sizes[i];
2059 snap->id = rbd_dev->header.snapc->snaps[i];
2060 if (device_is_registered(&rbd_dev->dev)) {
Alex Elder14e70852012-07-19 09:09:27 -05002061 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002062 if (ret < 0)
2063 goto err;
2064 }
Alex Elder4e891e02012-07-10 20:30:10 -05002065
2066 return snap;
2067
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002068err:
2069 kfree(snap->name);
2070 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002071
2072 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002073}
2074
2075/*
Alex Elder35938152012-08-02 11:29:46 -05002076 * Scan the rbd device's current snapshot list and compare it to the
2077 * newly-received snapshot context. Remove any existing snapshots
2078 * not present in the new snapshot context. Add a new snapshot for
2079 * any snaphots in the snapshot context not in the current list.
2080 * And verify there are no changes to snapshots we already know
2081 * about.
2082 *
2083 * Assumes the snapshots in the snapshot context are sorted by
2084 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2085 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002086 */
2087static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
2088{
Alex Elder35938152012-08-02 11:29:46 -05002089 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2090 const u32 snap_count = snapc->num_snaps;
2091 char *snap_name = rbd_dev->header.snap_names;
2092 struct list_head *head = &rbd_dev->snaps;
2093 struct list_head *links = head->next;
2094 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002095
Alex Elder35938152012-08-02 11:29:46 -05002096 while (index < snap_count || links != head) {
2097 u64 snap_id;
2098 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002099
Alex Elder35938152012-08-02 11:29:46 -05002100 snap_id = index < snap_count ? snapc->snaps[index]
2101 : CEPH_NOSNAP;
2102 snap = links != head ? list_entry(links, struct rbd_snap, node)
2103 : NULL;
2104 BUG_ON(snap && snap->id == CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002105
Alex Elder35938152012-08-02 11:29:46 -05002106 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2107 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002108
Alex Elder35938152012-08-02 11:29:46 -05002109 /* Existing snapshot not in the new snap context */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002110
Alex Elder35938152012-08-02 11:29:46 -05002111 if (rbd_dev->snap_id == snap->id)
Josh Durgine88a36e2011-11-21 18:14:25 -08002112 rbd_dev->snap_exists = false;
Alex Elder35938152012-08-02 11:29:46 -05002113 __rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002114
Alex Elder35938152012-08-02 11:29:46 -05002115 /* Done with this list entry; advance */
2116
2117 links = next;
2118 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002119 }
Alex Elder35938152012-08-02 11:29:46 -05002120
2121 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2122 struct rbd_snap *new_snap;
2123
2124 /* We haven't seen this snapshot before */
2125
2126 new_snap = __rbd_add_snap_dev(rbd_dev, index,
2127 snap_name);
2128 if (IS_ERR(new_snap))
2129 return PTR_ERR(new_snap);
2130
2131 /* New goes before existing, or at end of list */
2132
2133 if (snap)
2134 list_add_tail(&new_snap->node, &snap->node);
2135 else
2136 list_add(&new_snap->node, head);
2137 } else {
2138 /* Already have this one */
2139
2140 BUG_ON(snap->size != rbd_dev->header.snap_sizes[index]);
2141 BUG_ON(strcmp(snap->name, snap_name));
2142
2143 /* Done with this list entry; advance */
2144
2145 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002146 }
Alex Elder35938152012-08-02 11:29:46 -05002147
2148 /* Advance to the next entry in the snapshot context */
2149
2150 index++;
2151 snap_name += strlen(snap_name) + 1;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002152 }
2153
2154 return 0;
2155}
2156
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002157static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2158{
Alex Elderf0f8cef2012-01-29 13:57:44 -06002159 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002160 struct device *dev;
2161 struct rbd_snap *snap;
2162
2163 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2164 dev = &rbd_dev->dev;
2165
2166 dev->bus = &rbd_bus_type;
2167 dev->type = &rbd_device_type;
2168 dev->parent = &rbd_root_dev;
2169 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05002170 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002171 ret = device_register(dev);
2172 if (ret < 0)
Alex Elderf0f8cef2012-01-29 13:57:44 -06002173 goto out;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002174
2175 list_for_each_entry(snap, &rbd_dev->snaps, node) {
Alex Elder14e70852012-07-19 09:09:27 -05002176 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002177 if (ret < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002178 break;
2179 }
Alex Elderf0f8cef2012-01-29 13:57:44 -06002180out:
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002181 mutex_unlock(&ctl_mutex);
2182 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002183}
2184
/*
 * Unregister the rbd device's struct device.
 * NOTE(review): further teardown presumably happens in the release
 * callback set in rbd_bus_add_dev() (rbd_dev_release) -- confirm.
 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002185static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2186{
2187 device_unregister(&rbd_dev->dev);
2188}
2189
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002190static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2191{
2192 int ret, rc;
2193
2194 do {
Alex Elder0e6f3222012-07-25 09:32:40 -05002195 ret = rbd_req_sync_watch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002196 if (ret == -ERANGE) {
Alex Elder1fe5e992012-07-25 09:32:41 -05002197 rc = rbd_refresh_header(rbd_dev, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002198 if (rc < 0)
2199 return rc;
2200 }
2201 } while (ret == -ERANGE);
2202
2203 return ret;
2204}
2205
/*
 * Highest rbd device id currently considered in use: incremented by
 * rbd_id_get() to hand out a new id, adjusted by rbd_id_put().
 */
Alex Elder1ddbe942012-01-29 13:57:44 -06002206static atomic64_t rbd_id_max = ATOMIC64_INIT(0);
2207
2208/*
Alex Elder499afd52012-02-02 08:13:29 -06002209 * Get a unique rbd identifier for the given new rbd_dev, and add
2210 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06002211 */
Alex Elder499afd52012-02-02 08:13:29 -06002212static void rbd_id_get(struct rbd_device *rbd_dev)
Alex Elderb7f23c32012-01-29 13:57:43 -06002213{
Alex Elderde71a292012-07-03 16:01:19 -05002214 rbd_dev->dev_id = atomic64_inc_return(&rbd_id_max);
Alex Elder499afd52012-02-02 08:13:29 -06002215
2216 spin_lock(&rbd_dev_list_lock);
2217 list_add_tail(&rbd_dev->node, &rbd_dev_list);
2218 spin_unlock(&rbd_dev_list_lock);
Alex Elder1ddbe942012-01-29 13:57:44 -06002219}
Alex Elderb7f23c32012-01-29 13:57:43 -06002220
Alex Elder1ddbe942012-01-29 13:57:44 -06002221/*
Alex Elder499afd52012-02-02 08:13:29 -06002222 * Remove an rbd_dev from the global list, and record that its
2223 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002224 */
Alex Elder499afd52012-02-02 08:13:29 -06002225static void rbd_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002226{
Alex Elderd184f6b2012-01-29 13:57:44 -06002227 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05002228 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002229 int max_id;
2230
2231 BUG_ON(rbd_id < 1);
Alex Elder499afd52012-02-02 08:13:29 -06002232
2233 spin_lock(&rbd_dev_list_lock);
2234 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002235
2236 /*
2237 * If the id being "put" is not the current maximum, there
2238 * is nothing special we need to do.
2239 */
2240 if (rbd_id != atomic64_read(&rbd_id_max)) {
2241 spin_unlock(&rbd_dev_list_lock);
2242 return;
2243 }
2244
2245 /*
2246 * We need to update the current maximum id. Search the
2247 * list to find out what it is. We're more likely to find
2248 * the maximum at the end, so search the list backward.
2249 */
2250 max_id = 0;
2251 list_for_each_prev(tmp, &rbd_dev_list) {
2252 struct rbd_device *rbd_dev;
2253
2254 rbd_dev = list_entry(tmp, struct rbd_device, node);
2255 if (rbd_id > max_id)
2256 max_id = rbd_id;
2257 }
Alex Elder499afd52012-02-02 08:13:29 -06002258 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002259
Alex Elder1ddbe942012-01-29 13:57:44 -06002260 /*
Alex Elderd184f6b2012-01-29 13:57:44 -06002261 * The max id could have been updated by rbd_id_get(), in
2262 * which case it now accurately reflects the new maximum.
2263 * Be careful not to overwrite the maximum value in that
2264 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002265 */
Alex Elderd184f6b2012-01-29 13:57:44 -06002266 atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
Alex Elderb7f23c32012-01-29 13:57:43 -06002267}
2268
/*
 * Advance *buf past any leading white space so that it points at the
 * start of the next token (or at the terminating '\0' if there is
 * none), and return the length of that token, i.e. the number of
 * consecutive non-white-space characters found there.  Returns 0 if
 * no token is left.
 *
 * *buf must point to a '\0'-terminated string.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf;

	start += strspn(start, whitespace);	/* Skip to start of token */
	*buf = start;

	return strcspn(start, whitespace);	/* Token length */
}
2287
/*
 * Locate the next token in *buf and, if it fits (its length is less
 * than token_size, leaving room for the terminator), copy it into
 * the caller's token buffer and terminate the copy with '\0'.  The
 * token buffer is left untouched when the token does not fit.
 *
 * *buf must be terminated with '\0' on entry.  It is always advanced
 * to just past the token found, whether or not it was copied.
 *
 * Returns the length of the token found (not counting the '\0'):
 * 0 if there is no token, and a value >= token_size if the token
 * was too long to be copied.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	const size_t token_len = next_token(buf);

	if (token_len < token_size) {
		memcpy(token, *buf, token_len);
		token[token_len] = '\0';
	}
	*buf += token_len;

	return token_len;
}
2317
2318/*
Alex Elderea3352f2012-07-09 21:04:23 -05002319 * Finds the next token in *buf, dynamically allocates a buffer big
2320 * enough to hold a copy of it, and copies the token into the new
2321 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2322 * that a duplicate buffer is created even for a zero-length token.
2323 *
2324 * Returns a pointer to the newly-allocated duplicate, or a null
2325 * pointer if memory for the duplicate was not available. If
2326 * the lenp argument is a non-null pointer, the length of the token
2327 * (not including the '\0') is returned in *lenp.
2328 *
2329 * If successful, the *buf pointer will be updated to point beyond
2330 * the end of the found token.
2331 *
2332 * Note: uses GFP_KERNEL for allocation.
2333 */
2334static inline char *dup_token(const char **buf, size_t *lenp)
2335{
2336 char *dup;
2337 size_t len;
2338
2339 len = next_token(buf);
2340 dup = kmalloc(len + 1, GFP_KERNEL);
2341 if (!dup)
2342 return NULL;
2343
2344 memcpy(dup, *buf, len);
2345 *(dup + len) = '\0';
2346 *buf += len;
2347
2348 if (lenp)
2349 *lenp = len;
2350
2351 return dup;
2352}
2353
2354/*
Alex Elder0bed54d2012-07-03 16:01:18 -05002355 * This fills in the pool_name, image_name, image_name_len, snap_name,
Alex Eldera725f65e2012-02-02 08:13:30 -06002356 * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
2357 * on the list of monitor addresses and other options provided via
2358 * /sys/bus/rbd/add.
Alex Elderd22f76e2012-07-12 10:46:35 -05002359 *
2360 * Note: rbd_dev is assumed to have been initially zero-filled.
Alex Eldera725f65e2012-02-02 08:13:30 -06002361 */
2362static int rbd_add_parse_args(struct rbd_device *rbd_dev,
2363 const char *buf,
Alex Elder7ef32142012-02-02 08:13:30 -06002364 const char **mon_addrs,
Alex Elder5214ecc2012-02-02 08:13:30 -06002365 size_t *mon_addrs_size,
Alex Eldere28fff262012-02-02 08:13:30 -06002366 char *options,
Alex Elder0bed54d2012-07-03 16:01:18 -05002367 size_t options_size)
Alex Eldera725f65e2012-02-02 08:13:30 -06002368{
Alex Elderd22f76e2012-07-12 10:46:35 -05002369 size_t len;
2370 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06002371
2372 /* The first four tokens are required */
2373
Alex Elder7ef32142012-02-02 08:13:30 -06002374 len = next_token(&buf);
2375 if (!len)
Alex Eldera725f65e2012-02-02 08:13:30 -06002376 return -EINVAL;
Alex Elder5214ecc2012-02-02 08:13:30 -06002377 *mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06002378 *mon_addrs = buf;
2379
2380 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06002381
Alex Eldere28fff262012-02-02 08:13:30 -06002382 len = copy_token(&buf, options, options_size);
2383 if (!len || len >= options_size)
2384 return -EINVAL;
Alex Eldera725f65e2012-02-02 08:13:30 -06002385
Alex Elderbf3e5ae2012-07-09 21:04:23 -05002386 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05002387 rbd_dev->pool_name = dup_token(&buf, NULL);
2388 if (!rbd_dev->pool_name)
Alex Elderd22f76e2012-07-12 10:46:35 -05002389 goto out_err;
Alex Eldere28fff262012-02-02 08:13:30 -06002390
Alex Elder0bed54d2012-07-03 16:01:18 -05002391 rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
2392 if (!rbd_dev->image_name)
Alex Elderbf3e5ae2012-07-09 21:04:23 -05002393 goto out_err;
Alex Eldere28fff262012-02-02 08:13:30 -06002394
Alex Eldercb8627c2012-07-09 21:04:23 -05002395 /* Create the name of the header object */
2396
Alex Elder0bed54d2012-07-03 16:01:18 -05002397 rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
Alex Elderbf3e5ae2012-07-09 21:04:23 -05002398 + sizeof (RBD_SUFFIX),
2399 GFP_KERNEL);
Alex Elder0bed54d2012-07-03 16:01:18 -05002400 if (!rbd_dev->header_name)
Alex Eldercb8627c2012-07-09 21:04:23 -05002401 goto out_err;
Alex Elder0bed54d2012-07-03 16:01:18 -05002402 sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
Alex Eldera725f65e2012-02-02 08:13:30 -06002403
Alex Eldere28fff262012-02-02 08:13:30 -06002404 /*
Alex Elder820a5f32012-07-09 21:04:24 -05002405 * The snapshot name is optional. If none is is supplied,
2406 * we use the default value.
Alex Eldere28fff262012-02-02 08:13:30 -06002407 */
Alex Elder820a5f32012-07-09 21:04:24 -05002408 rbd_dev->snap_name = dup_token(&buf, &len);
2409 if (!rbd_dev->snap_name)
2410 goto out_err;
2411 if (!len) {
2412 /* Replace the empty name with the default */
2413 kfree(rbd_dev->snap_name);
2414 rbd_dev->snap_name
2415 = kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
2416 if (!rbd_dev->snap_name)
2417 goto out_err;
2418
Alex Eldere28fff262012-02-02 08:13:30 -06002419 memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
2420 sizeof (RBD_SNAP_HEAD_NAME));
Alex Elder849b4262012-07-09 21:04:24 -05002421 }
Alex Eldere28fff262012-02-02 08:13:30 -06002422
Alex Eldera725f65e2012-02-02 08:13:30 -06002423 return 0;
Alex Elderd22f76e2012-07-12 10:46:35 -05002424
2425out_err:
Alex Elder0bed54d2012-07-03 16:01:18 -05002426 kfree(rbd_dev->header_name);
Alex Elderd78fd7a2012-07-26 23:37:14 -05002427 rbd_dev->header_name = NULL;
Alex Elder0bed54d2012-07-03 16:01:18 -05002428 kfree(rbd_dev->image_name);
Alex Elderd78fd7a2012-07-26 23:37:14 -05002429 rbd_dev->image_name = NULL;
2430 rbd_dev->image_name_len = 0;
Alex Elderd22f76e2012-07-12 10:46:35 -05002431 kfree(rbd_dev->pool_name);
2432 rbd_dev->pool_name = NULL;
2433
2434 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06002435}
2436
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002437static ssize_t rbd_add(struct bus_type *bus,
2438 const char *buf,
2439 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002440{
Alex Eldercb8627c2012-07-09 21:04:23 -05002441 char *options;
2442 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06002443 const char *mon_addrs = NULL;
2444 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06002445 struct ceph_osd_client *osdc;
2446 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002447
2448 if (!try_module_get(THIS_MODULE))
2449 return -ENODEV;
2450
Alex Elder27cc2592012-02-02 08:13:30 -06002451 options = kmalloc(count, GFP_KERNEL);
2452 if (!options)
2453 goto err_nomem;
Alex Eldercb8627c2012-07-09 21:04:23 -05002454 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2455 if (!rbd_dev)
2456 goto err_nomem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002457
2458 /* static rbd_device initialization */
2459 spin_lock_init(&rbd_dev->lock);
2460 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002461 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08002462 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002463
Alex Elderd184f6b2012-01-29 13:57:44 -06002464 /* generate unique id: find highest unique id, add one */
Alex Elder499afd52012-02-02 08:13:29 -06002465 rbd_id_get(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002466
Alex Eldera725f65e2012-02-02 08:13:30 -06002467 /* Fill in the device name, now that we have its id. */
Alex Elder81a89792012-02-02 08:13:30 -06002468 BUILD_BUG_ON(DEV_NAME_LEN
2469 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
Alex Elderde71a292012-07-03 16:01:19 -05002470 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
Alex Eldere124a822012-01-29 13:57:44 -06002471
Alex Eldera725f65e2012-02-02 08:13:30 -06002472 /* parse add command */
Alex Elder7ef32142012-02-02 08:13:30 -06002473 rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
Alex Eldere28fff262012-02-02 08:13:30 -06002474 options, count);
Alex Eldera725f65e2012-02-02 08:13:30 -06002475 if (rc)
2476 goto err_put_id;
2477
Alex Elder5214ecc2012-02-02 08:13:30 -06002478 rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1,
2479 options);
Alex Elderd720bcb2012-02-02 08:13:30 -06002480 if (IS_ERR(rbd_dev->rbd_client)) {
2481 rc = PTR_ERR(rbd_dev->rbd_client);
Alex Elderd78fd7a2012-07-26 23:37:14 -05002482 rbd_dev->rbd_client = NULL;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002483 goto err_put_id;
Alex Elderd720bcb2012-02-02 08:13:30 -06002484 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002485
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002486 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06002487 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002488 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2489 if (rc < 0)
2490 goto err_out_client;
Alex Elder9bb2f332012-07-12 10:46:35 -05002491 rbd_dev->pool_id = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002492
2493 /* register our block device */
Alex Elder27cc2592012-02-02 08:13:30 -06002494 rc = register_blkdev(0, rbd_dev->name);
2495 if (rc < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002496 goto err_out_client;
Alex Elder27cc2592012-02-02 08:13:30 -06002497 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002498
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002499 rc = rbd_bus_add_dev(rbd_dev);
2500 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002501 goto err_out_blkdev;
2502
Alex Elder32eec682012-02-08 16:11:14 -06002503 /*
2504 * At this point cleanup in the event of an error is the job
2505 * of the sysfs code (initiated by rbd_bus_del_dev()).
2506 *
2507 * Set up and announce blkdev mapping.
2508 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002509 rc = rbd_init_disk(rbd_dev);
2510 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002511 goto err_out_bus;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002512
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002513 rc = rbd_init_watch_dev(rbd_dev);
2514 if (rc)
2515 goto err_out_bus;
2516
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002517 return count;
2518
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002519err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002520 /* this will also clean up rest of rbd_dev stuff */
2521
2522 rbd_bus_del_dev(rbd_dev);
2523 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002524 return rc;
2525
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002526err_out_blkdev:
2527 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2528err_out_client:
2529 rbd_put_client(rbd_dev);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002530err_put_id:
Alex Eldercb8627c2012-07-09 21:04:23 -05002531 if (rbd_dev->pool_name) {
Alex Elder820a5f32012-07-09 21:04:24 -05002532 kfree(rbd_dev->snap_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002533 kfree(rbd_dev->header_name);
2534 kfree(rbd_dev->image_name);
Alex Eldercb8627c2012-07-09 21:04:23 -05002535 kfree(rbd_dev->pool_name);
2536 }
Alex Elder499afd52012-02-02 08:13:29 -06002537 rbd_id_put(rbd_dev);
Alex Elder27cc2592012-02-02 08:13:30 -06002538err_nomem:
Alex Elder27cc2592012-02-02 08:13:30 -06002539 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05002540 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06002541
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002542 dout("Error adding device %s\n", buf);
2543 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06002544
2545 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002546}
2547
Alex Elderde71a292012-07-03 16:01:19 -05002548static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002549{
2550 struct list_head *tmp;
2551 struct rbd_device *rbd_dev;
2552
Alex Eldere124a822012-01-29 13:57:44 -06002553 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002554 list_for_each(tmp, &rbd_dev_list) {
2555 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05002556 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06002557 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002558 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06002559 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002560 }
Alex Eldere124a822012-01-29 13:57:44 -06002561 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002562 return NULL;
2563}
2564
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002565static void rbd_dev_release(struct device *dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002566{
Alex Elder593a9e72012-02-07 12:03:37 -06002567 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002568
Alex Elder1dbb4392012-01-24 10:08:37 -06002569 if (rbd_dev->watch_request) {
2570 struct ceph_client *client = rbd_dev->rbd_client->client;
2571
2572 ceph_osdc_unregister_linger_request(&client->osdc,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002573 rbd_dev->watch_request);
Alex Elder1dbb4392012-01-24 10:08:37 -06002574 }
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002575 if (rbd_dev->watch_event)
Alex Elder070c6332012-07-25 09:32:41 -05002576 rbd_req_sync_unwatch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002577
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002578 rbd_put_client(rbd_dev);
2579
2580 /* clean up and free blkdev */
2581 rbd_free_disk(rbd_dev);
2582 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder32eec682012-02-08 16:11:14 -06002583
2584 /* done with the id, and with the rbd_dev */
Alex Elder820a5f32012-07-09 21:04:24 -05002585 kfree(rbd_dev->snap_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002586 kfree(rbd_dev->header_name);
Alex Elderd22f76e2012-07-12 10:46:35 -05002587 kfree(rbd_dev->pool_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002588 kfree(rbd_dev->image_name);
Alex Elder32eec682012-02-08 16:11:14 -06002589 rbd_id_put(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002590 kfree(rbd_dev);
2591
2592 /* release module ref */
2593 module_put(THIS_MODULE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002594}
2595
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002596static ssize_t rbd_remove(struct bus_type *bus,
2597 const char *buf,
2598 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002599{
2600 struct rbd_device *rbd_dev = NULL;
2601 int target_id, rc;
2602 unsigned long ul;
2603 int ret = count;
2604
2605 rc = strict_strtoul(buf, 10, &ul);
2606 if (rc)
2607 return rc;
2608
2609 /* convert to int; abort if we lost anything in the conversion */
2610 target_id = (int) ul;
2611 if (target_id != ul)
2612 return -EINVAL;
2613
2614 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2615
2616 rbd_dev = __rbd_get_dev(target_id);
2617 if (!rbd_dev) {
2618 ret = -ENOENT;
2619 goto done;
2620 }
2621
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002622 __rbd_remove_all_snaps(rbd_dev);
2623 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002624
2625done:
2626 mutex_unlock(&ctl_mutex);
2627 return ret;
2628}
2629
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002630static ssize_t rbd_snap_add(struct device *dev,
2631 struct device_attribute *attr,
2632 const char *buf,
2633 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002634{
Alex Elder593a9e72012-02-07 12:03:37 -06002635 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002636 int ret;
2637 char *name = kmalloc(count + 1, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002638 if (!name)
2639 return -ENOMEM;
2640
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002641 snprintf(name, count, "%s", buf);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002642
2643 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2644
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002645 ret = rbd_header_add_snap(rbd_dev,
2646 name, GFP_KERNEL);
2647 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002648 goto err_unlock;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002649
Alex Elderb8136232012-07-25 09:32:41 -05002650 ret = __rbd_refresh_header(rbd_dev, NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002651 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002652 goto err_unlock;
2653
2654 /* shouldn't hold ctl_mutex when notifying.. notify might
2655 trigger a watch callback that would need to get that mutex */
2656 mutex_unlock(&ctl_mutex);
2657
2658 /* make a best effort, don't error if failed */
Alex Elder4cb16252012-07-25 09:32:40 -05002659 rbd_req_sync_notify(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002660
2661 ret = count;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002662 kfree(name);
2663 return ret;
2664
2665err_unlock:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002666 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002667 kfree(name);
2668 return ret;
2669}
2670
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002671/*
2672 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002673 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002674 */
2675static int rbd_sysfs_init(void)
2676{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002677 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002678
Alex Elderfed4c142012-02-07 12:03:36 -06002679 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06002680 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002681 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002682
Alex Elderfed4c142012-02-07 12:03:36 -06002683 ret = bus_register(&rbd_bus_type);
2684 if (ret < 0)
2685 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002686
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002687 return ret;
2688}
2689
2690static void rbd_sysfs_cleanup(void)
2691{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002692 bus_unregister(&rbd_bus_type);
Alex Elderfed4c142012-02-07 12:03:36 -06002693 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002694}
2695
2696int __init rbd_init(void)
2697{
2698 int rc;
2699
2700 rc = rbd_sysfs_init();
2701 if (rc)
2702 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002703 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002704 return 0;
2705}
2706
2707void __exit rbd_exit(void)
2708{
2709 rbd_sysfs_cleanup();
2710}
2711
2712module_init(rbd_init);
2713module_exit(rbd_exit);
2714
2715MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
2716MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
2717MODULE_DESCRIPTION("rados block device");
2718
2719/* following authorship retained from original osdblk.c */
2720MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
2721
2722MODULE_LICENSE("GPL");