/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

                 Documentation/ABI/testing/sysfs-bus-rbd

 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define SECTOR_SHIFT            9
#define SECTOR_SIZE             (1ULL << SECTOR_SHIFT)

#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR    256             /* max minors per blkdev */

#define RBD_MAX_SNAP_NAME_LEN   32
#define RBD_MAX_OPT_LEN         1024

/* Snapshot name used when mapping the image head (no snapshot) */
#define RBD_SNAP_HEAD_NAME      "-"

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN            32
#define MAX_INT_FORMAT_WIDTH    ((5 * sizeof (int)) / 2 + 1)

#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
        u64 image_size;                 /* image size in bytes */
        char *object_prefix;            /* name prefix for the image's data objects */
        __u8 obj_order;                 /* log2 of the object (segment) size */
        __u8 crypt_type;
        __u8 comp_type;
        struct ceph_snap_context *snapc;
        u64 snap_names_len;             /* total bytes in the snap_names buffer */
        u32 total_snaps;                /* number of snapshots */

        char *snap_names;               /* NUL-separated snapshot names */
        u64 *snap_sizes;                /* image size at each snapshot */

        u64 obj_version;
};
92
/* Per-client rbd mount options (see rbd_opts_tokens) */
struct rbd_options {
        int notify_timeout;     /* defaults to RBD_NOTIFY_TIMEOUT_DEFAULT */
};
96
/*
 * an instance of the client. multiple devices may share an rbd client.
 */
struct rbd_client {
        struct ceph_client *client;
        struct rbd_options *rbd_opts;
        struct kref kref;               /* reference count */
        struct list_head node;          /* entry in rbd_client_list */
};
106
/*
 * a request completion status
 */
struct rbd_req_status {
        int done;               /* nonzero once this sub-request completed */
        int rc;                 /* completion result code */
        u64 bytes;              /* bytes completed */
};
115
/*
 * a collection of requests
 */
struct rbd_req_coll {
        int total;                      /* number of sub-requests */
        int num_done;                   /* sub-requests completed so far */
        struct kref kref;               /* reference count */
        struct rbd_req_status status[0];        /* one slot per sub-request */
};
125
/*
 * a single io request
 */
struct rbd_request {
        struct request *rq;             /* blk layer request */
        struct bio *bio;                /* cloned bio */
        struct page **pages;            /* list of used pages */
        u64 len;
        int coll_index;                 /* slot in coll->status[] */
        struct rbd_req_coll *coll;
};
137
/* In-memory representation of one image snapshot (with its sysfs device) */
struct rbd_snap {
        struct device dev;              /* sysfs device */
        const char *name;
        u64 size;                       /* image size at this snapshot */
        struct list_head node;          /* entry in rbd_device->snaps */
        u64 id;                         /* snapshot id */
};
145
/*
 * a single device
 */
struct rbd_device {
        int dev_id;             /* blkdev unique id */

        int major;              /* blkdev assigned major */
        struct gendisk *disk;           /* blkdev's gendisk and rq */
        struct request_queue *q;

        struct rbd_client *rbd_client;  /* shared, refcounted ceph client */

        char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

        spinlock_t lock;                /* queue lock */

        struct rbd_image_header header; /* in-memory image metadata */
        char *image_name;
        size_t image_name_len;
        char *header_name;
        char *pool_name;
        int pool_id;

        struct ceph_osd_event *watch_event;
        struct ceph_osd_request *watch_request;

        /* protects updating the header */
        struct rw_semaphore header_rwsem;
        /* name of the snapshot this device reads from */
        char *snap_name;
        /* id of the snapshot this device reads from */
        u64 snap_id;            /* current snapshot id */
        /* whether the snap_id this device reads from still exists */
        bool snap_exists;
        int read_only;

        struct list_head node;          /* entry in rbd_dev_list */

        /* list of snapshots */
        struct list_head snaps;

        /* sysfs related */
        struct device dev;
};
190
static DEFINE_MUTEX(ctl_mutex);   /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);      /* protects rbd_dev_list */

static LIST_HEAD(rbd_client_list);              /* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);   /* protects rbd_client_list */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700198
/* Forward declarations for routines defined later in this file */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
static void rbd_dev_release(struct device *dev);
static ssize_t rbd_snap_add(struct device *dev,
                            struct device_attribute *attr,
                            const char *buf,
                            size_t count);
static void __rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800206
static ssize_t rbd_add(struct bus_type *bus, const char *buf,
                       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
                          size_t count);

/* /sys/bus/rbd/{add,remove}: write-only, owner access only */
static struct bus_attribute rbd_bus_attrs[] = {
        __ATTR(add, S_IWUSR, NULL, rbd_add),
        __ATTR(remove, S_IWUSR, NULL, rbd_remove),
        __ATTR_NULL
};

static struct bus_type rbd_bus_type = {
        .name           = "rbd",
        .bus_attrs      = rbd_bus_attrs,
};

/* Intentionally empty: rbd_root_dev is static, nothing to free */
static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
        .init_name      = "rbd",
        .release        = rbd_root_dev_release,
};
231
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800232
/* Take a reference on the device's embedded sysfs device */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
        return get_device(&rbd_dev->dev);
}

/* Drop a reference taken with rbd_get_dev() */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
        put_device(&rbd_dev->dev);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700242
Alex Elder1fe5e992012-07-25 09:32:41 -0500243static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700244
/*
 * Block device open callback: refuse writable opens of a read-only
 * mapping, then pin the device and propagate its ro flag to the bdev.
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
        struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

        if ((mode & FMODE_WRITE) && rbd_dev->read_only)
                return -EROFS;

        rbd_get_dev(rbd_dev);
        set_device_ro(bdev, rbd_dev->read_only);

        return 0;
}
257
/* Block device release callback: drop the reference taken in rbd_open() */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
        struct rbd_device *rbd_dev = disk->private_data;

        rbd_put_dev(rbd_dev);

        return 0;
}
266
/* Block device operations exported to the block layer */
static const struct block_device_operations rbd_bd_ops = {
        .owner                  = THIS_MODULE,
        .open                   = rbd_open,
        .release                = rbd_release,
};
272
273/*
274 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500275 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700276 */
Alex Elder43ae4702012-07-03 16:01:18 -0500277static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700278 struct rbd_options *rbd_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700279{
280 struct rbd_client *rbdc;
281 int ret = -ENOMEM;
282
283 dout("rbd_client_create\n");
284 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
285 if (!rbdc)
286 goto out_opt;
287
288 kref_init(&rbdc->kref);
289 INIT_LIST_HEAD(&rbdc->node);
290
Alex Elderbc534d82012-01-29 13:57:44 -0600291 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
292
Alex Elder43ae4702012-07-03 16:01:18 -0500293 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700294 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600295 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500296 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700297
298 ret = ceph_open_session(rbdc->client);
299 if (ret < 0)
300 goto out_err;
301
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700302 rbdc->rbd_opts = rbd_opts;
303
Alex Elder432b8582012-01-29 13:57:44 -0600304 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700305 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600306 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700307
Alex Elderbc534d82012-01-29 13:57:44 -0600308 mutex_unlock(&ctl_mutex);
309
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700310 dout("rbd_client_create created %p\n", rbdc);
311 return rbdc;
312
313out_err:
314 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600315out_mutex:
316 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700317 kfree(rbdc);
318out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500319 if (ceph_opts)
320 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400321 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700322}
323
/*
 * Find a ceph client with specific addr and configuration.
 * Caller must hold rbd_client_list_lock; no reference is taken on the
 * returned client.
 */
static struct rbd_client *__rbd_client_find(struct ceph_options *ceph_opts)
{
        struct rbd_client *client_node;

        /* CEPH_OPT_NOSHARE means the caller wants a private client */
        if (ceph_opts->flags & CEPH_OPT_NOSHARE)
                return NULL;

        list_for_each_entry(client_node, &rbd_client_list, node)
                if (!ceph_compare_options(ceph_opts, client_node->client))
                        return client_node;
        return NULL;
}
339
/*
 * mount options
 */
enum {
        Opt_notify_timeout,
        Opt_last_int,
        /* int args above */
        Opt_last_string,
        /* string args above */
};

/* Token table consumed by match_token() in parse_rbd_opts_token() */
static match_table_t rbd_opts_tokens = {
        {Opt_notify_timeout, "notify_timeout=%d"},
        /* int args above */
        /* string args above */
        {-1, NULL}
};
357
/*
 * Parse one "key[=value]" option token.  Callback for ceph_parse_options(),
 * with @private pointing at the rbd_options being filled in.
 *
 * Returns 0 on success, -EINVAL for an unrecognized token, or the
 * match_int() error for a malformed integer argument.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
        struct rbd_options *rbd_opts = private;
        substring_t argstr[MAX_OPT_ARGS];
        int token, intval, ret;

        token = match_token(c, rbd_opts_tokens, argstr);
        if (token < 0)
                return -EINVAL;

        /* Tokens below Opt_last_int carry an integer argument */
        if (token < Opt_last_int) {
                ret = match_int(&argstr[0], &intval);
                if (ret < 0) {
                        pr_err("bad mount option arg (not int) "
                               "at '%s'\n", c);
                        return ret;
                }
                dout("got int token %d val %d\n", token, intval);
        } else if (token > Opt_last_int && token < Opt_last_string) {
                dout("got string token %d val %s\n", token,
                     argstr[0].from);
        } else {
                dout("got token %d\n", token);
        }

        switch (token) {
        case Opt_notify_timeout:
                rbd_opts->notify_timeout = intval;
                break;
        default:
                BUG_ON(token);  /* every known token must be handled above */
        }
        return 0;
}
392
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  On success the returned client holds a
 * reference the caller must drop; on failure an ERR_PTR() is returned.
 */
static struct rbd_client *rbd_get_client(const char *mon_addr,
                                         size_t mon_addr_len,
                                         char *options)
{
        struct rbd_client *rbdc;
        struct ceph_options *ceph_opts;
        struct rbd_options *rbd_opts;

        rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
        if (!rbd_opts)
                return ERR_PTR(-ENOMEM);

        rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;

        ceph_opts = ceph_parse_options(options, mon_addr,
                                        mon_addr + mon_addr_len,
                                        parse_rbd_opts_token, rbd_opts);
        if (IS_ERR(ceph_opts)) {
                kfree(rbd_opts);
                return ERR_CAST(ceph_opts);
        }

        spin_lock(&rbd_client_list_lock);
        rbdc = __rbd_client_find(ceph_opts);
        if (rbdc) {
                /* using an existing client */
                kref_get(&rbdc->kref);
                spin_unlock(&rbd_client_list_lock);

                /* our copies are redundant; the found client owns its own */
                ceph_destroy_options(ceph_opts);
                kfree(rbd_opts);

                return rbdc;
        }
        spin_unlock(&rbd_client_list_lock);

        /* rbd_client_create() consumes ceph_opts even on failure */
        rbdc = rbd_client_create(ceph_opts, rbd_opts);

        if (IS_ERR(rbdc))
                kfree(rbd_opts);

        return rbdc;
}
440
/*
 * Destroy ceph client; kref release callback invoked when the last
 * reference is dropped.
 *
 * Takes rbd_client_list_lock itself to unlink the client, so the
 * caller must NOT hold it.  (An earlier version required the caller
 * to hold the lock; the old comment was stale.)
 */
static void rbd_client_release(struct kref *kref)
{
        struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

        dout("rbd_release_client %p\n", rbdc);
        spin_lock(&rbd_client_list_lock);
        list_del(&rbdc->node);
        spin_unlock(&rbd_client_list_lock);

        ceph_destroy_client(rbdc->client);
        kfree(rbdc->rbd_opts);
        kfree(rbdc);
}
459
/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_device *rbd_dev)
{
        kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
        rbd_dev->rbd_client = NULL;     /* guard against use after put */
}
469
/*
 * Destroy requests collection; kref release callback for rbd_req_coll.
 */
static void rbd_coll_release(struct kref *kref)
{
        struct rbd_req_coll *coll =
                container_of(kref, struct rbd_req_coll, kref);

        dout("rbd_coll_release %p\n", coll);
        kfree(coll);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700481
Alex Elder8e94af82012-07-25 09:32:40 -0500482static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
483{
484 return !memcmp(&ondisk->text,
485 RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT));
486}
487
/*
 * Create a new header structure, translate header format from the on-disk
 * header.
 *
 * @allocated_snaps is the number of snapshot slots present in @ondisk;
 * if it differs from the full snapshot count recorded in the header,
 * the snapshot context is left unfilled (header->snapc == NULL) and 0
 * is returned so the caller can retry with a bigger buffer.
 *
 * Returns 0 on success, -ENXIO for a bad signature, -EINVAL if the
 * snapshot count would overflow the context allocation, or -ENOMEM.
 */
static int rbd_header_from_disk(struct rbd_image_header *header,
                                 struct rbd_image_header_ondisk *ondisk,
                                 u32 allocated_snaps)
{
        u32 snap_count;
        size_t size;

        if (!rbd_dev_ondisk_valid(ondisk))
                return -ENXIO;

        snap_count = le32_to_cpu(ondisk->snap_count);

        /* Make sure we don't overflow below */
        size = SIZE_MAX - sizeof (struct ceph_snap_context);
        if (snap_count > size / sizeof (header->snapc->snaps[0]))
                return -EINVAL;

        memset(header, 0, sizeof (*header));

        /* object_prefix is a NUL-terminated copy of the on-disk block name */
        size = sizeof (ondisk->block_name) + 1;
        header->object_prefix = kmalloc(size, GFP_KERNEL);
        if (!header->object_prefix)
                return -ENOMEM;
        memcpy(header->object_prefix, ondisk->block_name, size - 1);
        header->object_prefix[size - 1] = '\0';

        if (snap_count) {
                header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
                BUG_ON(header->snap_names_len > (u64) SIZE_MAX);
                header->snap_names = kmalloc(header->snap_names_len,
                                             GFP_KERNEL);
                if (!header->snap_names)
                        goto out_err;

                size = snap_count * sizeof (*header->snap_sizes);
                header->snap_sizes = kmalloc(size, GFP_KERNEL);
                if (!header->snap_sizes)
                        goto out_err;
        } else {
                WARN_ON(ondisk->snap_names_len);
                header->snap_names_len = 0;
                header->snap_names = NULL;
                header->snap_sizes = NULL;
        }

        header->image_size = le64_to_cpu(ondisk->image_size);
        header->obj_order = ondisk->options.order;
        header->crypt_type = ondisk->options.crypt_type;
        header->comp_type = ondisk->options.comp_type;
        header->total_snaps = snap_count;

        /*
         * If the number of snapshot ids provided by the caller
         * doesn't match the number in the entire context there's
         * no point in going further.  Caller will try again after
         * getting an updated snapshot context from the server.
         */
        if (allocated_snaps != snap_count)
                return 0;

        size = sizeof (struct ceph_snap_context);
        size += snap_count * sizeof (header->snapc->snaps[0]);
        header->snapc = kzalloc(size, GFP_KERNEL);
        if (!header->snapc)
                goto out_err;

        atomic_set(&header->snapc->nref, 1);
        header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
        header->snapc->num_snaps = snap_count;

        /* Fill in the snapshot information */

        if (snap_count) {
                u32 i;

                for (i = 0; i < snap_count; i++) {
                        header->snapc->snaps[i] =
                                le64_to_cpu(ondisk->snaps[i].id);
                        header->snap_sizes[i] =
                                le64_to_cpu(ondisk->snaps[i].image_size);
                }

                /* copy snapshot names (packed right after the snap array) */
                memcpy(header->snap_names, &ondisk->snaps[snap_count],
                        header->snap_names_len);
        }

        return 0;

out_err:
        /* free everything allocated so far, NULLing to stay re-entrant */
        kfree(header->snap_sizes);
        header->snap_sizes = NULL;
        kfree(header->snap_names);
        header->snap_names = NULL;
        header->snap_names_len = 0;
        kfree(header->object_prefix);
        header->object_prefix = NULL;

        return -ENOMEM;
}
592
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700593static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
594 u64 *seq, u64 *size)
595{
596 int i;
597 char *p = header->snap_names;
598
Alex Elder00f1f362012-02-07 12:03:36 -0600599 for (i = 0; i < header->total_snaps; i++) {
600 if (!strcmp(snap_name, p)) {
601
602 /* Found it. Pass back its id and/or size */
603
604 if (seq)
605 *seq = header->snapc->snaps[i];
606 if (size)
607 *size = header->snap_sizes[i];
608 return i;
609 }
610 p += strlen(p) + 1; /* Skip ahead to the next name */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700611 }
Alex Elder00f1f362012-02-07 12:03:36 -0600612 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700613}
614
/*
 * Set the device's snapshot state from rbd_dev->snap_name: mapping the
 * image head (writable) or a named snapshot (read-only).  Passes back
 * the mapped size via @size when non-NULL.
 *
 * Returns 0 on success, or snap_by_name()'s error (-ENOENT) if the
 * named snapshot does not exist.
 */
static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
{
        int ret;

        down_write(&rbd_dev->header_rwsem);

        if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
                    sizeof (RBD_SNAP_HEAD_NAME))) {
                /* Mapping the head: no snapshot, device is writable */
                rbd_dev->snap_id = CEPH_NOSNAP;
                rbd_dev->snap_exists = false;
                rbd_dev->read_only = 0;
                if (size)
                        *size = rbd_dev->header.image_size;
        } else {
                u64 snap_id = 0;

                ret = snap_by_name(&rbd_dev->header, rbd_dev->snap_name,
                                   &snap_id, size);
                if (ret < 0)
                        goto done;
                rbd_dev->snap_id = snap_id;
                rbd_dev->snap_exists = true;
                rbd_dev->read_only = 1;         /* snapshots are immutable */
        }

        ret = 0;
done:
        up_write(&rbd_dev->header_rwsem);
        return ret;
}
645
/*
 * Free all dynamically allocated parts of an in-memory image header,
 * NULLing each pointer so a second call is harmless.
 */
static void rbd_header_free(struct rbd_image_header *header)
{
        kfree(header->object_prefix);
        header->object_prefix = NULL;
        kfree(header->snap_sizes);
        header->snap_sizes = NULL;
        kfree(header->snap_names);
        header->snap_names = NULL;
        header->snap_names_len = 0;
        /* drops our reference on the snap context (may free it) */
        ceph_put_snap_context(header->snapc);
        header->snapc = NULL;
}
658
659/*
660 * get the actual striped segment name, offset and length
661 */
662static u64 rbd_get_segment(struct rbd_image_header *header,
Alex Elderca1e49a2012-07-10 20:30:09 -0500663 const char *object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700664 u64 ofs, u64 len,
665 char *seg_name, u64 *segofs)
666{
667 u64 seg = ofs >> header->obj_order;
668
669 if (seg_name)
670 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
Alex Elderca1e49a2012-07-10 20:30:09 -0500671 "%s.%012llx", object_prefix, seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700672
673 ofs = ofs & ((1 << header->obj_order) - 1);
674 len = min_t(u64, len, (1 << header->obj_order) - ofs);
675
676 if (segofs)
677 *segofs = ofs;
678
679 return len;
680}
681
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700682static int rbd_get_num_segments(struct rbd_image_header *header,
683 u64 ofs, u64 len)
684{
685 u64 start_seg = ofs >> header->obj_order;
686 u64 end_seg = (ofs + len - 1) >> header->obj_order;
687 return end_seg - start_seg + 1;
688}
689
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700690/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700691 * returns the size of an object in the image
692 */
693static u64 rbd_obj_bytes(struct rbd_image_header *header)
694{
695 return 1 << header->obj_order;
696}
697
698/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700699 * bio helpers
700 */
701
702static void bio_chain_put(struct bio *chain)
703{
704 struct bio *tmp;
705
706 while (chain) {
707 tmp = chain;
708 chain = chain->bi_next;
709 bio_put(tmp);
710 }
711}
712
/*
 * zeros a bio chain, starting at specific offset
 *
 * Every byte of the chain at or beyond @start_ofs (an offset into the
 * whole chain, in bytes) is cleared; earlier bytes are left alone.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
        struct bio_vec *bv;
        unsigned long flags;
        void *buf;
        int i;
        int pos = 0;            /* byte offset of current segment in chain */

        while (chain) {
                bio_for_each_segment(bv, chain, i) {
                        if (pos + bv->bv_len > start_ofs) {
                                /* zero from start_ofs or the segment start,
                                 * whichever comes later, to segment end */
                                int remainder = max(start_ofs - pos, 0);
                                buf = bvec_kmap_irq(bv, &flags);
                                memset(buf + remainder, 0,
                                       bv->bv_len - remainder);
                                bvec_kunmap_irq(buf, &flags);
                        }
                        pos += bv->bv_len;
                }

                chain = chain->bi_next;
        }
}
739
/*
 * bio_chain_clone - clone a chain of bios up to a certain length.
 * might return a bio_pair that will need to be released.
 *
 * Clones bios from *old until @len bytes are covered, splitting the
 * final bio if it straddles the boundary.  On return *old points at
 * the first unconsumed bio and *next at where the next pass resumes.
 * Returns the cloned chain, or NULL on allocation failure (any
 * partially built chain is freed).
 */
static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
                                   struct bio_pair **bp,
                                   int len, gfp_t gfpmask)
{
        struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
        int total = 0;

        /* release the pair left over from a previous call, if any */
        if (*bp) {
                bio_pair_release(*bp);
                *bp = NULL;
        }

        while (old_chain && (total < len)) {
                tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
                if (!tmp)
                        goto err_out;

                if (total + old_chain->bi_size > len) {
                        struct bio_pair *bp;

                        /*
                         * this split can only happen with a single paged bio,
                         * split_bio will BUG_ON if this is not the case
                         *
                         * NOTE(review): this local 'bp' shadows the 'bp'
                         * parameter, so the new pair is never stored back
                         * through *bp -- verify the intended release path.
                         */
                        dout("bio_chain_clone split! total=%d remaining=%d"
                             "bi_size=%u\n",
                             total, len - total, old_chain->bi_size);

                        /* split the bio. We'll release it either in the next
                           call, or it will have to be released outside */
                        bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
                        if (!bp)
                                goto err_out;

                        __bio_clone(tmp, &bp->bio1);

                        *next = &bp->bio2;
                } else {
                        __bio_clone(tmp, old_chain);
                        *next = old_chain->bi_next;
                }

                tmp->bi_bdev = NULL;
                /* after the first clone, further allocs must not sleep */
                gfpmask &= ~__GFP_WAIT;
                tmp->bi_next = NULL;

                if (!new_chain) {
                        new_chain = tail = tmp;
                } else {
                        tail->bi_next = tmp;
                        tail = tmp;
                }
                old_chain = old_chain->bi_next;

                total += tmp->bi_size;
        }

        BUG_ON(total < len);

        if (tail)
                tail->bi_next = NULL;

        *old = old_chain;

        return new_chain;

err_out:
        dout("bio_chain_clone with err\n");
        bio_chain_put(new_chain);
        return NULL;
}
815
/*
 * helpers for osd request op vectors.
 *
 * Allocates a zeroed vector of @num_ops + 1 ops (the extra, zero-filled
 * slot presumably acts as a terminator -- confirm against callers) and
 * initializes the first op's opcode and payload length.  Returns NULL
 * on allocation failure.
 */
static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
                                                 int opcode, u32 payload_len)
{
        struct ceph_osd_req_op *ops;

        ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
        if (!ops)
                return NULL;

        ops[0].op = opcode;

        /*
         * op extent offset and length will be set later on
         * in calc_raw_layout()
         */
        ops[0].payload_len = payload_len;

        return ops;
}
838
/*
 * Free an op vector allocated by rbd_create_rw_ops().
 */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
843
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700844static void rbd_coll_end_req_index(struct request *rq,
845 struct rbd_req_coll *coll,
846 int index,
847 int ret, u64 len)
848{
849 struct request_queue *q;
850 int min, max, i;
851
Alex Elderbd919d42012-07-13 20:35:11 -0500852 dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
853 coll, index, ret, (unsigned long long) len);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700854
855 if (!rq)
856 return;
857
858 if (!coll) {
859 blk_end_request(rq, ret, len);
860 return;
861 }
862
863 q = rq->q;
864
865 spin_lock_irq(q->queue_lock);
866 coll->status[index].done = 1;
867 coll->status[index].rc = ret;
868 coll->status[index].bytes = len;
869 max = min = coll->num_done;
870 while (max < coll->total && coll->status[max].done)
871 max++;
872
873 for (i = min; i<max; i++) {
874 __blk_end_request(rq, coll->status[i].rc,
875 coll->status[i].bytes);
876 coll->num_done++;
877 kref_put(&coll->kref, rbd_coll_release);
878 }
879 spin_unlock_irq(q->queue_lock);
880}
881
/*
 * Complete the sub-request described by @req within its collection.
 */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
887
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700888/*
889 * Send ceph osd request
890 */
/*
 * Send ceph osd request
 *
 * Builds and submits one osd request for @object_name covering
 * [@ofs, @ofs + @len).  Data travels either in @bio or in the
 * @pages/@num_pages vector.  If @rbd_cb is NULL the call is
 * synchronous: we wait for the reply, report the reassert version
 * through @ver and drop the request reference here; otherwise
 * completion is handled by @rbd_cb.  @coll/@coll_index tie this
 * request to a collection of sub-requests for one block request (see
 * rbd_coll_end_req_index()).  If @linger_req is non-NULL the request
 * is registered to linger (re-sent across osd map changes) and
 * returned to the caller through it.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *ops,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *req,
					 struct ceph_msg *msg),
			  struct ceph_osd_request **linger_req,
			  u64 *ver)
{
	struct ceph_osd_request *req;
	struct ceph_file_layout *layout;
	int ret;
	u64 bno;
	struct timespec mtime = CURRENT_TIME;
	struct rbd_request *req_data;
	struct ceph_osd_request_head *reqhead;
	struct ceph_osd_client *osdc;

	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
	if (!req_data) {
		/* fail our slot so the block request can still complete */
		if (coll)
			rbd_coll_end_req_index(rq, coll, coll_index,
					       -ENOMEM, len);
		return -ENOMEM;
	}

	if (coll) {
		req_data->coll = coll;
		req_data->coll_index = coll_index;
	}

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
	     (unsigned long long) ofs, (unsigned long long) len);

	osdc = &rbd_dev->rbd_client->client->osdc;
	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
				      false, GFP_NOIO, pages, bio);
	if (!req) {
		ret = -ENOMEM;
		goto done_pages;
	}

	req->r_callback = rbd_cb;

	req_data->rq = rq;
	req_data->bio = bio;
	req_data->pages = pages;
	req_data->len = len;

	req->r_priv = req_data;

	reqhead = req->r_request->front.iov_base;
	/*
	 * NOTE(review): the request head snap id is forced to CEPH_NOSNAP
	 * even when @snapid differs; the snap id actually used for the
	 * layout goes through ceph_calc_raw_layout() below — confirm this
	 * is intended.
	 */
	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);

	/*
	 * NOTE(review): strncpy() leaves r_oid unterminated if
	 * object_name is sizeof(req->r_oid) bytes or longer — confirm
	 * callers guarantee shorter names.
	 */
	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
	req->r_oid_len = strlen(req->r_oid);

	/* one object per stripe: stripe unit == object size, single stripe */
	layout = &req->r_file_layout;
	memset(layout, 0, sizeof(*layout));
	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_stripe_count = cpu_to_le32(1);
	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
	ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
			     req, ops);

	ceph_osdc_build_request(req, ofs, &len,
				ops,
				snapc,
				&mtime,
				req->r_oid, req->r_oid_len);

	if (linger_req) {
		ceph_osdc_set_request_linger(osdc, req);
		*linger_req = req;
	}

	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* synchronous mode: wait here and drop our reference */
		ret = ceph_osdc_wait_request(osdc, req);
		if (ver)
			*ver = le64_to_cpu(req->r_reassert_version.version);
		dout("reassert_ver=%llu\n",
		     (unsigned long long)
		     le64_to_cpu(req->r_reassert_version.version));
		ceph_osdc_put_request(req);
	}
	return ret;

done_err:
	bio_chain_put(req_data->bio);
	ceph_osdc_put_request(req);
done_pages:
	/* complete our collection slot with the error */
	rbd_coll_end_req(req_data, ret, len);
	kfree(req_data);
	return ret;
}
999
1000/*
1001 * Ceph osd op callback
1002 */
/*
 * Ceph osd op callback
 *
 * Completion handler for asynchronous requests issued via rbd_do_op().
 * Parses the reply, zero-fills the bio chain for missing or short
 * reads, completes the request's slot in its collection and releases
 * the per-request state.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);	/* ops follow the reply head */
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
	     (unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* reading a nonexistent object succeeds with zeros */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero the tail and report the full length */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1039
/*
 * Minimal completion callback: just drop the request reference.
 * Used for fire-and-forget requests such as notify acks.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1044
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001045/*
1046 * Do a synchronous ceph osd operation
1047 */
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a page vector covering [@ofs, @ofs + @len), runs the
 * operation described by @ops synchronously via rbd_do_request() and,
 * for reads, copies the returned data into @buf.  Returns the number
 * of bytes handled, or a negative errno.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int flags,
			   struct ceph_osd_req_op *ops,
			   const char *object_name,
			   u64 ofs, u64 len,
			   char *buf,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	BUG_ON(ops == NULL);

	num_pages = calc_pages_for(ofs , len);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* a NULL callback makes rbd_do_request() wait for completion */
	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			     object_name, ofs, len, NULL,
			     pages, num_pages,
			     flags,
			     ops,
			     NULL, 0,
			     NULL,
			     linger_req, ver);
	if (ret < 0)
		goto done;

	/* on success, ret is the number of bytes the osd returned */
	if ((flags & CEPH_OSD_FLAG_READ) && buf)
		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1088
1089/*
1090 * Do an asynchronous ceph osd operation
1091 */
/*
 * Do an asynchronous ceph osd operation
 *
 * Maps the image extent [@ofs, @ofs + @len) onto its containing object
 * segment and issues a single read or write request for it.  The
 * caller (rbd_rq_fn() via bio_chain_clone()) guarantees @bio never
 * crosses a segment boundary.
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 snapid,
		     int opcode, int flags,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;

	seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
	if (!seg_name)
		return -ENOMEM;

	/* translate the image extent into object name/offset/length */
	seg_len = rbd_get_segment(&rbd_dev->header,
				  rbd_dev->header.object_prefix,
				  ofs, len,
				  seg_name, &seg_ofs);

	/* only writes carry a data payload */
	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	BUG_ON(seg_len < len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1144
1145/*
1146 * Request async osd write
1147 */
/*
 * Request async osd write
 *
 * Writes always target the image head (CEPH_NOSNAP), never a snapshot,
 * and carry the snap context so the osd can preserve snapshot data.
 */
static int rbd_req_write(struct request *rq,
			 struct rbd_device *rbd_dev,
			 struct ceph_snap_context *snapc,
			 u64 ofs, u64 len,
			 struct bio *bio,
			 struct rbd_req_coll *coll,
			 int coll_index)
{
	return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
			 CEPH_OSD_OP_WRITE,
			 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			 ofs, len, bio, coll, coll_index);
}
1161
1162/*
1163 * Request async osd read
1164 */
/*
 * Request async osd read
 *
 * Reads need no snap context, only the snap id to read from.
 */
static int rbd_req_read(struct request *rq,
			 struct rbd_device *rbd_dev,
			 u64 snapid,
			 u64 ofs, u64 len,
			 struct bio *bio,
			 struct rbd_req_coll *coll,
			 int coll_index)
{
	return rbd_do_op(rq, rbd_dev, NULL,
			 snapid,
			 CEPH_OSD_OP_READ,
			 CEPH_OSD_FLAG_READ,
			 ofs, len, bio, coll, coll_index);
}
1179
1180/*
1181 * Request sync osd read
1182 */
/*
 * Request sync osd read
 *
 * Synchronously read [@ofs, @ofs + @len) of @object_name at snapshot
 * @snapid into @buf, reporting the object version through @ver.
 * Returns bytes read or a negative errno.
 */
static int rbd_req_sync_read(struct rbd_device *rbd_dev,
			  u64 snapid,
			  const char *object_name,
			  u64 ofs, u64 len,
			  char *buf,
			  u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
	if (!ops)
		return -ENOMEM;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			       snapid,
			       CEPH_OSD_FLAG_READ,
			       ops, object_name, ofs, len, buf, NULL, ver);
	rbd_destroy_ops(ops);

	return ret;
}
1205
/*
 * Request sync osd notify-ack
 */
/*
 * Acknowledge a notification on the header object so the osd stops
 * re-sending it.  Sent from the watch callback; completion is
 * fire-and-forget via rbd_simple_req_cb().
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = cpu_to_le64(ver);
	/*
	 * NOTE(review): unlike .ver above, the cookie is not byte-swapped;
	 * presumably @notify_id arrives from the osd client already in
	 * wire (little-endian) order — confirm against the watch callback
	 * path in the osd client.
	 */
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  rbd_dev->header_name, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  ops,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1235
/*
 * Watch callback for the image header object.  A notification means
 * the header may have changed, so refresh it and ack the notify
 * (ack even on refresh failure, so the osd stops re-sending).
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	u64 hver;
	int rc;

	if (!rbd_dev)
		return;

	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
	     rbd_dev->header_name, (unsigned long long) notify_id,
	     (unsigned int) opcode);
	rc = rbd_refresh_header(rbd_dev, &hver);
	if (rc)
		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
			   " update snaps: %d\n", rbd_dev->major, rc);

	/*
	 * NOTE(review): if the refresh failed, hver may be unset here —
	 * confirm rbd_refresh_header() always writes it.
	 */
	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
}
1255
1256/*
1257 * Request sync osd watch
1258 */
/*
 * Request sync osd watch
 *
 * Register a watch on the image header object so header changes are
 * delivered to rbd_watch_cb().  The osd event and the lingering watch
 * request are stored in @rbd_dev for teardown by
 * rbd_req_sync_unwatch().
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	/* non-one-shot event: fires for every notification */
	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 == register the watch */

	/* lingering request: re-sent automatically across osd map changes */
	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1299
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001300/*
1301 * Request sync osd unwatch
1302 */
/*
 * Request sync osd unwatch
 *
 * Tear down the watch registered by rbd_req_sync_watch()
 * (watch.flag == 0 unregisters) and cancel the associated osd event.
 */
static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 0;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL, NULL, NULL);


	rbd_destroy_ops(ops);
	/* the event is dropped whether or not the unwatch op succeeded */
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	return ret;
}
1329
/* Context handed to rbd_notify_cb() through the osd event machinery. */
struct rbd_notify_info {
	struct rbd_device *rbd_dev;	/* device the notify was sent for */
};
1333
1334static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1335{
Alex Elder0ce1a792012-07-03 16:01:18 -05001336 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1337 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001338 return;
1339
Alex Elderbd919d42012-07-13 20:35:11 -05001340 dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n",
1341 rbd_dev->header_name, (unsigned long long) notify_id,
1342 (unsigned int) opcode);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001343}
1344
1345/*
1346 * Request sync osd notify
1347 */
/*
 * Request sync osd notify
 *
 * Send a notify on the header object and wait (bounded by
 * CEPH_OSD_TIMEOUT_DEFAULT) for watchers to be notified.
 */
static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_event *event;
	struct rbd_notify_info info;
	/* payload: protocol version + timeout, two u32s */
	int payload_len = sizeof(u32) + sizeof(u32);
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len);
	if (!ops)
		return -ENOMEM;

	info.rbd_dev = rbd_dev;

	/* one-shot event, completed once the notify round-trips */
	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
				     (void *)&info, &event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = 1;
	ops[0].watch.flag = 1;
	ops[0].watch.cookie = event->cookie;
	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
	ops[0].watch.timeout = 12;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			       CEPH_NOSNAP,
			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			       ops,
			       rbd_dev->header_name,
			       0, 0, NULL, NULL, NULL);
	if (ret < 0)
		goto fail_event;

	/*
	 * NOTE(review): the wait result is only logged and the event is
	 * not cancelled on this path — confirm one-shot events are reaped
	 * by the osd client, otherwise this leaks the event.
	 */
	ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
	dout("ceph_osdc_wait_event returned %d\n", ret);
	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(event);
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1394
/*
 * Request sync osd class method call (exec)
 */
/*
 * Synchronously invoke an osd class method ("class.method") on
 * @object_name with @data (of @len bytes) as input, reporting the
 * object version through @ver.  Returns the osd result or a negative
 * errno.
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *data,
			     int len,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int class_name_len = strlen(class_name);
	int method_name_len = strlen(method_name);
	int ret;

	/* the payload carries class name, method name and input data */
	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL,
				class_name_len + method_name_len + len);
	if (!ops)
		return -ENOMEM;

	ops[0].cls.class_name = class_name;
	ops[0].cls.class_len = (__u8) class_name_len;
	ops[0].cls.method_name = method_name;
	ops[0].cls.method_len = (__u8) method_name_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = data;
	ops[0].cls.indata_len = len;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			       CEPH_NOSNAP,
			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			       ops,
			       object_name, 0, 0, NULL, NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1435
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001436static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1437{
1438 struct rbd_req_coll *coll =
1439 kzalloc(sizeof(struct rbd_req_coll) +
1440 sizeof(struct rbd_req_status) * num_reqs,
1441 GFP_ATOMIC);
1442
1443 if (!coll)
1444 return NULL;
1445 coll->total = num_reqs;
1446 kref_init(&coll->kref);
1447 return coll;
1448}
1449
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001450/*
1451 * block device queue callback
1452 */
/*
 * block device queue callback
 *
 * Pulls requests off the queue and splits each into per-segment osd
 * requests tracked by an rbd_req_coll collection.  The queue lock is
 * held on entry (block layer contract); it is dropped while requests
 * are built and re-taken before completing a request or fetching the
 * next one.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;
	struct bio_pair *bp = NULL;

	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		struct bio *rq_bio, *next_bio = NULL;
		bool do_write;
		unsigned int size;
		u64 op_size = 0;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;

		/* peek at request from block layer */
		if (!rq)
			break;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		rq_bio = rq->bio;
		if (do_write && rbd_dev->read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		/* drop the queue lock while we build the osd requests */
		spin_unlock_irq(q->queue_lock);

		down_read(&rbd_dev->header_rwsem);

		/* a mapped snapshot may have been removed underneath us */
		if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) {
			up_read(&rbd_dev->header_rwsem);
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		/* pin the snap context so all writes see a stable snap set */
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		do {
			/* a bio clone to be passed down to OSD req */
			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
			/* how many bytes fit in the current segment */
			op_size = rbd_get_segment(&rbd_dev->header,
						  rbd_dev->header.object_prefix,
						  ofs, size,
						  NULL, NULL);
			/* one collection reference per sub-request */
			kref_get(&coll->kref);
			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
					      op_size, GFP_ATOMIC);
			if (!bio) {
				/* fail this slot, keep going on the rest */
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, op_size);
				goto next_seg;
			}


			/* init OSD command: write or read */
			if (do_write)
				rbd_req_write(rq, rbd_dev,
					      snapc,
					      ofs,
					      op_size, bio,
					      coll, cur_seg);
			else
				rbd_req_read(rq, rbd_dev,
					     rbd_dev->snap_id,
					     ofs,
					     op_size, bio,
					     coll, cur_seg);

next_seg:
			size -= op_size;
			ofs += op_size;

			cur_seg++;
			rq_bio = next_bio;
		} while (size > 0);
		/* drop the initial reference from rbd_alloc_coll() */
		kref_put(&coll->kref, rbd_coll_release);

		if (bp)
			bio_pair_release(bp);
		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1569
1570/*
1571 * a queue callback. Makes sure that we don't create a bio that spans across
1572 * multiple osd objects. One exception would be with a single page bios,
1573 * which we handle later at bio_chain_clone
1574 */
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone
 *
 * Returns how many bytes of @bvec may be appended to the bio described
 * by @bmd without crossing an object (chunk) boundary.
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	unsigned int chunk_sectors;
	sector_t sector;
	unsigned int bio_sectors;
	int max;

	/* object size in sectors; obj_order is a power-of-two byte shift */
	chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
	bio_sectors = bmd->bi_size >> SECTOR_SHIFT;

	/* bytes remaining in the chunk after the bio's current sectors */
	max = (chunk_sectors - ((sector & (chunk_sectors - 1))
				+ bio_sectors)) << SECTOR_SHIFT;
	if (max < 0)
		max = 0; /* bio_add cannot handle a negative return */
	/* an empty bio may accept the whole bvec; splits are handled later */
	if (max <= bvec->bv_len && bio_sectors == 0)
		return bvec->bv_len;
	return max;
}
1596
/*
 * Free the gendisk, its request queue and the in-memory image header
 * for @rbd_dev.  Safe to call when no disk was ever set up.
 */
static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk = rbd_dev->disk;

	if (!disk)
		return;

	rbd_header_free(&rbd_dev->header);

	/* only unregister the disk if it was actually added */
	if (disk->flags & GENHD_FL_UP)
		del_gendisk(disk);
	if (disk->queue)
		blk_cleanup_queue(disk->queue);
	put_disk(disk);
}
1612
/*
 * (Re)read the on-disk image header into *header.
 *
 * Returns 0 on success, negative errno on failure.  On success the
 * caller owns the allocations inside *header (freed via
 * rbd_header_free()).
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
			   struct rbd_image_header *header)
{
	ssize_t rc;
	struct rbd_image_header_ondisk *dh;
	u32 snap_count = 0;
	u64 ver;
	size_t len;

	/*
	 * First reads the fixed-size header to determine the number
	 * of snapshots, then re-reads it, along with all snapshot
	 * records as well as their stored names.
	 */
	len = sizeof (*dh);
	while (1) {
		dh = kmalloc(len, GFP_KERNEL);
		if (!dh)
			return -ENOMEM;

		rc = rbd_req_sync_read(rbd_dev,
				       CEPH_NOSNAP,
				       rbd_dev->header_name,
				       0, len,
				       (char *)dh, &ver);
		if (rc < 0)
			goto out_dh;

		rc = rbd_header_from_disk(header, dh, snap_count);
		if (rc < 0) {
			if (rc == -ENXIO)
				pr_warning("unrecognized header format"
					   " for image %s\n",
					   rbd_dev->image_name);
			goto out_dh;
		}

		/*
		 * If the snapshot count did not change between the two
		 * reads we have a consistent snapshot; otherwise a
		 * snapshot was created/removed meanwhile, so retry with
		 * a buffer sized for the new count.
		 */
		if (snap_count == header->total_snaps)
			break;

		snap_count = header->total_snaps;
		len = sizeof (*dh) +
			snap_count * sizeof(struct rbd_image_snap_ondisk) +
			header->snap_names_len;

		rbd_header_free(header);
		kfree(dh);
	}
	header->obj_version = ver;

out_dh:
	kfree(dh);
	return rc;
}
1670
1671/*
1672 * create a snapshot
1673 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001674static int rbd_header_add_snap(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001675 const char *snap_name,
1676 gfp_t gfp_flags)
1677{
1678 int name_len = strlen(snap_name);
1679 u64 new_snapid;
1680 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001681 void *data, *p, *e;
Alex Elder1dbb4392012-01-24 10:08:37 -06001682 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001683
1684 /* we should create a snapshot only if we're pointing at the head */
Alex Elder0ce1a792012-07-03 16:01:18 -05001685 if (rbd_dev->snap_id != CEPH_NOSNAP)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001686 return -EINVAL;
1687
Alex Elder0ce1a792012-07-03 16:01:18 -05001688 monc = &rbd_dev->rbd_client->client->monc;
1689 ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
Alex Elderbd919d42012-07-13 20:35:11 -05001690 dout("created snapid=%llu\n", (unsigned long long) new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001691 if (ret < 0)
1692 return ret;
1693
1694 data = kmalloc(name_len + 16, gfp_flags);
1695 if (!data)
1696 return -ENOMEM;
1697
Sage Weil916d4d62011-05-12 16:10:50 -07001698 p = data;
1699 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001700
Sage Weil916d4d62011-05-12 16:10:50 -07001701 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1702 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001703
Alex Elder0bed54d2012-07-03 16:01:18 -05001704 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
Alex Elder0ce1a792012-07-03 16:01:18 -05001705 "rbd", "snap_add",
Alex Elderd67d4be2012-07-13 20:35:11 -05001706 data, p - data, NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001707
Sage Weil916d4d62011-05-12 16:10:50 -07001708 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001709
Alex Elder505cbb92012-07-19 08:49:18 -05001710 return ret < 0 ? ret : 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001711bad:
1712 return -ERANGE;
1713}
1714
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001715static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1716{
1717 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001718 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001719
Alex Eldera0593292012-07-19 09:09:27 -05001720 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder14e70852012-07-19 09:09:27 -05001721 __rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001722}
1723
/*
 * Re-read the image header from the OSDs and swap it into
 * rbd_dev->header under the header rwsem.  Optionally reports the
 * new header object version via *hver.  Caller must hold ctl_mutex.
 */
static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	struct rbd_image_header h;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	down_write(&rbd_dev->header_rwsem);

	/* resized? */
	if (rbd_dev->snap_id == CEPH_NOSNAP) {
		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;

		dout("setting size to %llu sectors", (unsigned long long) size);
		set_capacity(rbd_dev->disk, size);
	}

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	if (hver)
		*hver = h.obj_version;
	rbd_dev->header.obj_version = h.obj_version;
	rbd_dev->header.image_size = h.image_size;
	rbd_dev->header.total_snaps = h.total_snaps;
	/* Ownership of h's allocations transfers to rbd_dev->header here */
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_names_len = h.snap_names_len;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	ret = __rbd_init_snaps_header(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
1771
Alex Elder1fe5e992012-07-25 09:32:41 -05001772static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
1773{
1774 int ret;
1775
1776 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1777 ret = __rbd_refresh_header(rbd_dev, hver);
1778 mutex_unlock(&ctl_mutex);
1779
1780 return ret;
1781}
1782
/*
 * Set up the gendisk and request queue for an rbd device: reads the
 * image header, builds the snapshot list, selects the mapped
 * snapshot, then allocates and announces the block device.
 * Returns 0 on success, negative errno on failure.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	int rc;
	u64 segment_size;
	u64 total_size = 0;

	/* contact OSD, request size info about the object being mapped */
	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (rc)
		return rc;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = __rbd_init_snaps_header(rbd_dev);
	if (rc)
		return rc;

	/* total_size is set to the size of the mapped snapshot (or head) */
	rc = rbd_header_set_snap(rbd_dev, &total_size);
	if (rc)
		return rc;

	/* create gendisk info */
	rc = -ENOMEM;
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		goto out;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	rc = -ENOMEM;
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* keep bios from spanning objects; see rbd_merge_bvec() */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;
	rbd_dev->q = q;

	/* finally, announce the disk to the world */
	set_capacity(disk, total_size / SECTOR_SIZE);
	add_disk(disk);

	pr_info("%s: added with size 0x%llx\n",
		disk->disk_name, (unsigned long long)total_size);
	return 0;

out_disk:
	put_disk(disk);
out:
	return rc;
}
1855
/*
   sysfs
*/

/* Map the embedded struct device back to its owning rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1864
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001865static ssize_t rbd_size_show(struct device *dev,
1866 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001867{
Alex Elder593a9e72012-02-07 12:03:37 -06001868 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001869 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001870
Josh Durgina51aa0c2011-12-05 10:35:04 -08001871 down_read(&rbd_dev->header_rwsem);
1872 size = get_capacity(rbd_dev->disk);
1873 up_read(&rbd_dev->header_rwsem);
1874
1875 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001876}
1877
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001878static ssize_t rbd_major_show(struct device *dev,
1879 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001880{
Alex Elder593a9e72012-02-07 12:03:37 -06001881 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001882
1883 return sprintf(buf, "%d\n", rbd_dev->major);
1884}
1885
1886static ssize_t rbd_client_id_show(struct device *dev,
1887 struct device_attribute *attr, char *buf)
1888{
Alex Elder593a9e72012-02-07 12:03:37 -06001889 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001890
Alex Elder1dbb4392012-01-24 10:08:37 -06001891 return sprintf(buf, "client%lld\n",
1892 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001893}
1894
1895static ssize_t rbd_pool_show(struct device *dev,
1896 struct device_attribute *attr, char *buf)
1897{
Alex Elder593a9e72012-02-07 12:03:37 -06001898 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001899
1900 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1901}
1902
Alex Elder9bb2f332012-07-12 10:46:35 -05001903static ssize_t rbd_pool_id_show(struct device *dev,
1904 struct device_attribute *attr, char *buf)
1905{
1906 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1907
1908 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1909}
1910
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001911static ssize_t rbd_name_show(struct device *dev,
1912 struct device_attribute *attr, char *buf)
1913{
Alex Elder593a9e72012-02-07 12:03:37 -06001914 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001915
Alex Elder0bed54d2012-07-03 16:01:18 -05001916 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001917}
1918
1919static ssize_t rbd_snap_show(struct device *dev,
1920 struct device_attribute *attr,
1921 char *buf)
1922{
Alex Elder593a9e72012-02-07 12:03:37 -06001923 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001924
1925 return sprintf(buf, "%s\n", rbd_dev->snap_name);
1926}
1927
1928static ssize_t rbd_image_refresh(struct device *dev,
1929 struct device_attribute *attr,
1930 const char *buf,
1931 size_t size)
1932{
Alex Elder593a9e72012-02-07 12:03:37 -06001933 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05001934 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001935
Alex Elder1fe5e992012-07-25 09:32:41 -05001936 ret = rbd_refresh_header(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05001937
1938 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001939}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001940
/*
 * Per-device attributes published under /sys/bus/rbd/devices/<id>/.
 * "refresh" and "create_snap" are write-only action triggers; the
 * remaining attributes are read-only state.
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	&dev_attr_create_snap.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/*
 * Intentionally empty: the rbd_device is freed via the dev->release
 * callback (rbd_dev_release) installed in rbd_bus_add_dev(), which
 * takes precedence over this type-level release.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name = "rbd",
	.groups = rbd_attr_groups,
	.release = rbd_sysfs_dev_release,
};
1982
1983
/*
   sysfs - snapshots
*/

/* sysfs "snap_size": snapshot size in bytes. */
static ssize_t rbd_snap_size_show(struct device *dev,
				  struct device_attribute *attr,
				  char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
}

/* sysfs "snap_id": the snapshot's ceph snapshot id. */
static ssize_t rbd_snap_id_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
}

static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/*
 * Device-model release callback: frees the rbd_snap once its last
 * reference is dropped (after device_unregister()).
 */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups = rbd_snap_attr_groups,
	.release = rbd_snap_dev_release,
};
2035
/*
 * Unlink a snapshot from its device's list and unregister its sysfs
 * node.  The rbd_snap itself is freed later by the device release
 * callback (rbd_snap_dev_release).
 */
static void __rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	device_unregister(&snap->dev);
}
2041
Alex Elder14e70852012-07-19 09:09:27 -05002042static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002043 struct device *parent)
2044{
2045 struct device *dev = &snap->dev;
2046 int ret;
2047
2048 dev->type = &rbd_snap_device_type;
2049 dev->parent = parent;
2050 dev->release = rbd_snap_dev_release;
2051 dev_set_name(dev, "snap_%s", snap->name);
2052 ret = device_register(dev);
2053
2054 return ret;
2055}
2056
/*
 * Allocate an rbd_snap for snapshot slot i of the device's header
 * (size and id come from the header's parallel arrays), and register
 * it in sysfs if the parent device is already registered.
 *
 * Returns the new rbd_snap, or ERR_PTR(-errno) on failure.  The
 * caller is responsible for linking it into rbd_dev->snaps.
 */
static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
					   int i, const char *name)
{
	struct rbd_snap *snap;
	int ret;

	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
	if (!snap)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	snap->name = kstrdup(name, GFP_KERNEL);
	if (!snap->name)
		goto err;

	snap->size = rbd_dev->header.snap_sizes[i];
	snap->id = rbd_dev->header.snapc->snaps[i];
	if (device_is_registered(&rbd_dev->dev)) {
		ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
		if (ret < 0)
			goto err;
	}

	return snap;

err:
	/* kfree(NULL) is a no-op, so this is safe if kstrdup failed */
	kfree(snap->name);
	kfree(snap);

	return ERR_PTR(ret);
}
2088
/*
 * Scan the rbd device's current snapshot list and compare it to the
 * newly-received snapshot context.  Remove any existing snapshots
 * not present in the new snapshot context.  Add a new snapshot for
 * any snaphots in the snapshot context not in the current list.
 * And verify there are no changes to snapshots we already know
 * about.
 *
 * Assumes the snapshots in the snapshot context are sorted by
 * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
 * are also maintained in that order.)
 *
 * Caller must hold the header rwsem for write (or the device must
 * not yet be registered).  Returns 0 or a negative errno.
 */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const u32 snap_count = snapc->num_snaps;
	/* names are stored back-to-back, NUL-separated, in context order */
	char *snap_name = rbd_dev->header.snap_names;
	struct list_head *head = &rbd_dev->snaps;
	struct list_head *links = head->next;
	u32 index = 0;

	/* Two-cursor merge: index walks the context, links walks the list */
	while (index < snap_count || links != head) {
		u64 snap_id;
		struct rbd_snap *snap;

		snap_id = index < snap_count ? snapc->snaps[index]
					     : CEPH_NOSNAP;
		snap = links != head ? list_entry(links, struct rbd_snap, node)
				     : NULL;
		BUG_ON(snap && snap->id == CEPH_NOSNAP);

		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
			struct list_head *next = links->next;

			/* Existing snapshot not in the new snap context */

			/* if it was the mapped snapshot, it is now gone */
			if (rbd_dev->snap_id == snap->id)
				rbd_dev->snap_exists = false;
			__rbd_remove_snap_dev(snap);

			/* Done with this list entry; advance */

			links = next;
			continue;
		}

		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
			struct rbd_snap *new_snap;

			/* We haven't seen this snapshot before */

			new_snap = __rbd_add_snap_dev(rbd_dev, index,
						      snap_name);
			if (IS_ERR(new_snap))
				return PTR_ERR(new_snap);

			/* New goes before existing, or at end of list */

			if (snap)
				list_add_tail(&new_snap->node, &snap->node);
			else
				list_add(&new_snap->node, head);
		} else {
			/* Already have this one */

			BUG_ON(snap->size != rbd_dev->header.snap_sizes[index]);
			BUG_ON(strcmp(snap->name, snap_name));

			/* Done with this list entry; advance */

			links = links->next;
		}

		/* Advance to the next entry in the snapshot context */

		index++;
		snap_name += strlen(snap_name) + 1;
	}

	return 0;
}
2170
/*
 * Register the rbd device (and all of its known snapshots) on the
 * rbd bus in sysfs.  Returns 0 or a negative errno.
 *
 * NOTE(review): if registering one of the snapshot devices fails
 * mid-loop, earlier snapshot registrations are not rolled back here;
 * presumably the caller tears everything down — confirm.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	int ret;
	struct device *dev;
	struct rbd_snap *snap;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	dev = &rbd_dev->dev;

	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);
	if (ret < 0)
		goto out;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
		if (ret < 0)
			break;
	}
out:
	mutex_unlock(&ctl_mutex);
	return ret;
}
2198
/* Remove the rbd device from sysfs; drops the device reference. */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2203
/*
 * Set up a watch on the header object.  -ERANGE indicates our
 * cached header version is stale, so refresh the header and retry
 * until the watch is established (or a different error occurs).
 */
static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
{
	int ret, rc;

	do {
		ret = rbd_req_sync_watch(rbd_dev);
		if (ret == -ERANGE) {
			rc = rbd_refresh_header(rbd_dev, NULL);
			if (rc < 0)
				return rc;
		}
	} while (ret == -ERANGE);

	return ret;
}
2219
/* Highest device id currently in use (0 when none are). */
static atomic64_t rbd_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_id_get(struct rbd_device *rbd_dev)
{
	rbd_dev->dev_id = atomic64_inc_return(&rbd_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002234
Alex Elder1ddbe942012-01-29 13:57:44 -06002235/*
Alex Elder499afd52012-02-02 08:13:29 -06002236 * Remove an rbd_dev from the global list, and record that its
2237 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002238 */
Alex Elder499afd52012-02-02 08:13:29 -06002239static void rbd_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002240{
Alex Elderd184f6b2012-01-29 13:57:44 -06002241 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05002242 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002243 int max_id;
2244
2245 BUG_ON(rbd_id < 1);
Alex Elder499afd52012-02-02 08:13:29 -06002246
2247 spin_lock(&rbd_dev_list_lock);
2248 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002249
2250 /*
2251 * If the id being "put" is not the current maximum, there
2252 * is nothing special we need to do.
2253 */
2254 if (rbd_id != atomic64_read(&rbd_id_max)) {
2255 spin_unlock(&rbd_dev_list_lock);
2256 return;
2257 }
2258
2259 /*
2260 * We need to update the current maximum id. Search the
2261 * list to find out what it is. We're more likely to find
2262 * the maximum at the end, so search the list backward.
2263 */
2264 max_id = 0;
2265 list_for_each_prev(tmp, &rbd_dev_list) {
2266 struct rbd_device *rbd_dev;
2267
2268 rbd_dev = list_entry(tmp, struct rbd_device, node);
2269 if (rbd_id > max_id)
2270 max_id = rbd_id;
2271 }
Alex Elder499afd52012-02-02 08:13:29 -06002272 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002273
Alex Elder1ddbe942012-01-29 13:57:44 -06002274 /*
Alex Elderd184f6b2012-01-29 13:57:44 -06002275 * The max id could have been updated by rbd_id_get(), in
2276 * which case it now accurately reflects the new maximum.
2277 * Be careful not to overwrite the maximum value in that
2278 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002279 */
Alex Elderd184f6b2012-01-29 13:57:44 -06002280 atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
Alex Elderb7f23c32012-01-29 13:57:43 -06002281}
2282
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	const char *whitespace = " \f\n\r\t\v";
	size_t skipped;

	/* Step past any leading whitespace ... */
	skipped = strspn(*buf, whitespace);
	*buf += skipped;

	/* ... then measure the run of non-whitespace that follows. */
	return strcspn(*buf, whitespace);
}
2301
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	/* Copy only when the token (plus terminator) fits */
	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2331
2332/*
Alex Elderea3352f2012-07-09 21:04:23 -05002333 * Finds the next token in *buf, dynamically allocates a buffer big
2334 * enough to hold a copy of it, and copies the token into the new
2335 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2336 * that a duplicate buffer is created even for a zero-length token.
2337 *
2338 * Returns a pointer to the newly-allocated duplicate, or a null
2339 * pointer if memory for the duplicate was not available. If
2340 * the lenp argument is a non-null pointer, the length of the token
2341 * (not including the '\0') is returned in *lenp.
2342 *
2343 * If successful, the *buf pointer will be updated to point beyond
2344 * the end of the found token.
2345 *
2346 * Note: uses GFP_KERNEL for allocation.
2347 */
2348static inline char *dup_token(const char **buf, size_t *lenp)
2349{
2350 char *dup;
2351 size_t len;
2352
2353 len = next_token(buf);
2354 dup = kmalloc(len + 1, GFP_KERNEL);
2355 if (!dup)
2356 return NULL;
2357
2358 memcpy(dup, *buf, len);
2359 *(dup + len) = '\0';
2360 *buf += len;
2361
2362 if (lenp)
2363 *lenp = len;
2364
2365 return dup;
2366}
2367
2368/*
Alex Elder0bed54d2012-07-03 16:01:18 -05002369 * This fills in the pool_name, image_name, image_name_len, snap_name,
Alex Eldera725f65e2012-02-02 08:13:30 -06002370 * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
2371 * on the list of monitor addresses and other options provided via
2372 * /sys/bus/rbd/add.
Alex Elderd22f76e2012-07-12 10:46:35 -05002373 *
2374 * Note: rbd_dev is assumed to have been initially zero-filled.
Alex Eldera725f65e2012-02-02 08:13:30 -06002375 */
static int rbd_add_parse_args(struct rbd_device *rbd_dev,
			      const char *buf,
			      const char **mon_addrs,
			      size_t *mon_addrs_size,
			      char *options,
			      size_t options_size)
{
	size_t len;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len)
		return -EINVAL;
	/* Reported size leaves room for a terminating '\0' */
	*mon_addrs_size = len + 1;
	*mon_addrs = buf;	/* points into buf; not a private copy */

	buf += len;

	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return -EINVAL;

	/* All allocation failures below report -ENOMEM */
	ret = -ENOMEM;
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Create the name of the header object: "<image>" RBD_SUFFIX */

	rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
					+ sizeof (RBD_SUFFIX),
					GFP_KERNEL);
	if (!rbd_dev->header_name)
		goto out_err;
	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);

	/*
	 * The snapshot name is optional.  If none is supplied,
	 * we use the default value.
	 */
	rbd_dev->snap_name = dup_token(&buf, &len);
	if (!rbd_dev->snap_name)
		goto out_err;
	if (!len) {
		/* Replace the empty name with the default */
		kfree(rbd_dev->snap_name);
		rbd_dev->snap_name
			= kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
		if (!rbd_dev->snap_name)
			goto out_err;

		memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
			sizeof (RBD_SNAP_HEAD_NAME));
	}

	return 0;

out_err:
	/* Free and reset everything allocated above, in reverse order,
	 * so the caller sees the same zero-filled state it passed in */
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->image_name);
	rbd_dev->image_name = NULL;
	rbd_dev->image_name_len = 0;
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return ret;
}
2450
/*
 * Handle a write to /sys/bus/rbd/add: parse the monitor addresses,
 * options, pool, image and snapshot names from buf, connect to the
 * cluster, and register a new rbd block device.  Returns count on
 * success or a negative errno on failure.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	char *options;
	struct rbd_device *rbd_dev = NULL;
	const char *mon_addrs = NULL;
	size_t mon_addrs_size = 0;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	/* Hold a module reference for the lifetime of the device;
	 * dropped in rbd_dev_release() (or below on error) */
	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	options = kmalloc(count, GFP_KERNEL);
	if (!options)
		goto err_nomem;
	/* zero-filled allocation: rbd_add_parse_args() relies on it */
	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		goto err_nomem;

	/* static rbd_device initialization */
	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);
	init_rwsem(&rbd_dev->header_rwsem);

	/* generate unique id: find highest unique id, add one */
	rbd_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* parse add command */
	rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
				options, count);
	if (rc)
		goto err_put_id;

	rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1,
						options);
	if (IS_ERR(rbd_dev->rbd_client)) {
		rc = PTR_ERR(rbd_dev->rbd_client);
		rbd_dev->rbd_client = NULL;	/* don't pass ERR_PTR to cleanup */
		goto err_put_id;
	}

	/* pick the pool */
	osdc = &rbd_dev->rbd_client->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
	if (rc < 0)
		goto err_out_client;
	rbd_dev->pool_id = rc;

	/* register our block device */
	rc = register_blkdev(0, rbd_dev->name);	/* 0 = dynamic major */
	if (rc < 0)
		goto err_out_client;
	rbd_dev->major = rc;

	rc = rbd_bus_add_dev(rbd_dev);
	if (rc)
		goto err_out_blkdev;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 *
	 * Set up and announce blkdev mapping.
	 */
	rc = rbd_init_disk(rbd_dev);
	if (rc)
		goto err_out_bus;

	rc = rbd_init_watch_dev(rbd_dev);
	if (rc)
		goto err_out_bus;

	return count;

err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);
	kfree(options);
	return rc;

err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_client:
	rbd_put_client(rbd_dev);
err_put_id:
	/* pool_name is non-NULL only if rbd_add_parse_args() succeeded
	 * (it resets all name fields to NULL on failure) */
	if (rbd_dev->pool_name) {
		kfree(rbd_dev->snap_name);
		kfree(rbd_dev->header_name);
		kfree(rbd_dev->image_name);
		kfree(rbd_dev->pool_name);
	}
	rbd_id_put(rbd_dev);
err_nomem:
	kfree(rbd_dev);
	kfree(options);

	dout("Error adding device %s\n", buf);
	module_put(THIS_MODULE);

	return (ssize_t) rc;
}
2561
Alex Elderde71a292012-07-03 16:01:19 -05002562static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002563{
2564 struct list_head *tmp;
2565 struct rbd_device *rbd_dev;
2566
Alex Eldere124a82f2012-01-29 13:57:44 -06002567 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002568 list_for_each(tmp, &rbd_dev_list) {
2569 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05002570 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a82f2012-01-29 13:57:44 -06002571 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002572 return rbd_dev;
Alex Eldere124a82f2012-01-29 13:57:44 -06002573 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002574 }
Alex Eldere124a82f2012-01-29 13:57:44 -06002575 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002576 return NULL;
2577}
2578
/*
 * Release callback for an rbd device, invoked by the driver core
 * when the last reference to the embedded struct device is dropped
 * (after rbd_bus_del_dev()).  Tears down watch state, the ceph
 * client, the block device, and finally the rbd_device itself.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Cancel any lingering watch request before unwatching and
	 * dropping the client that owns the osd connection */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->snap_name);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref taken in rbd_add() */
	module_put(THIS_MODULE);
}
2609
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002610static ssize_t rbd_remove(struct bus_type *bus,
2611 const char *buf,
2612 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002613{
2614 struct rbd_device *rbd_dev = NULL;
2615 int target_id, rc;
2616 unsigned long ul;
2617 int ret = count;
2618
2619 rc = strict_strtoul(buf, 10, &ul);
2620 if (rc)
2621 return rc;
2622
2623 /* convert to int; abort if we lost anything in the conversion */
2624 target_id = (int) ul;
2625 if (target_id != ul)
2626 return -EINVAL;
2627
2628 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2629
2630 rbd_dev = __rbd_get_dev(target_id);
2631 if (!rbd_dev) {
2632 ret = -ENOENT;
2633 goto done;
2634 }
2635
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002636 __rbd_remove_all_snaps(rbd_dev);
2637 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002638
2639done:
2640 mutex_unlock(&ctl_mutex);
2641 return ret;
2642}
2643
/*
 * Handle a write to the per-device sysfs "snap_add" attribute:
 * create a new snapshot named by the written string, refresh the
 * header, and notify watchers (best effort).  Returns count on
 * success or a negative errno.
 */
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int ret;
	char *name = kmalloc(count + 1, GFP_KERNEL);
	if (!name)
		return -ENOMEM;

	/*
	 * NOTE(review): snprintf() with size "count" copies at most
	 * count - 1 bytes, silently dropping the final byte of buf.
	 * That strips the trailing '\n' from "echo name >...", but it
	 * would also truncate the last character of a name written
	 * without a newline (e.g. echo -n) -- confirm this is intended.
	 */
	snprintf(name, count, "%s", buf);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	ret = rbd_header_add_snap(rbd_dev,
				  name, GFP_KERNEL);
	if (ret < 0)
		goto err_unlock;

	ret = __rbd_refresh_header(rbd_dev, NULL);
	if (ret < 0)
		goto err_unlock;

	/* shouldn't hold ctl_mutex when notifying.. notify might
	   trigger a watch callback that would need to get that mutex */
	mutex_unlock(&ctl_mutex);

	/* make a best effort, don't error if failed */
	rbd_req_sync_notify(rbd_dev);

	ret = count;
	kfree(name);
	return ret;

err_unlock:
	mutex_unlock(&ctl_mutex);
	kfree(name);
	return ret;
}
2684
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002685/*
2686 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002687 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002688 */
2689static int rbd_sysfs_init(void)
2690{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002691 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002692
Alex Elderfed4c142012-02-07 12:03:36 -06002693 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06002694 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002695 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002696
Alex Elderfed4c142012-02-07 12:03:36 -06002697 ret = bus_register(&rbd_bus_type);
2698 if (ret < 0)
2699 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002700
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002701 return ret;
2702}
2703
/* Undo rbd_sysfs_init(): unregister in reverse order of creation */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
2709
2710int __init rbd_init(void)
2711{
2712 int rc;
2713
2714 rc = rbd_sysfs_init();
2715 if (rc)
2716 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002717 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002718 return 0;
2719}
2720
/* Module exit: remove the sysfs bus and root device */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2725
/* Module entry points and metadata */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");