blob: a27167942a9285d8d842049cdf629cd3152ad77a [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elder593a9e72012-02-07 12:03:37 -060044/*
45 * The basic unit of block I/O is a sector. It is interpreted in a
46 * number of contexts in Linux (blk, bio, genhd), but the default is
47 * universally 512 bytes. These symbols are just slightly more
48 * meaningful than the bare numbers they represent.
49 */
50#define SECTOR_SHIFT 9
51#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
52
Alex Elderf0f8cef2012-01-29 13:57:44 -060053#define RBD_DRV_NAME "rbd"
54#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070055
56#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
57
Yehuda Sadeh602adf42010-08-12 16:11:25 -070058#define RBD_MAX_SNAP_NAME_LEN 32
59#define RBD_MAX_OPT_LEN 1024
60
61#define RBD_SNAP_HEAD_NAME "-"
62
Alex Elder81a89792012-02-02 08:13:30 -060063/*
64 * An RBD device name will be "rbd#", where the "rbd" comes from
65 * RBD_DRV_NAME above, and # is a unique integer identifier.
66 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
67 * enough to hold all possible device names.
68 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070069#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060070#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070071
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070072#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
73
Yehuda Sadeh602adf42010-08-12 16:11:25 -070074/*
75 * block device image metadata (in-memory version)
76 */
struct rbd_image_header {
	u64 image_size;		/* image size in bytes, from the on-disk header */
	char *object_prefix;	/* kmalloc'd, NUL-terminated prefix for data object names */
	__u8 obj_order;		/* each data object holds 1 << obj_order bytes */
	__u8 crypt_type;	/* on-disk encryption type */
	__u8 comp_type;		/* on-disk compression type */
	struct ceph_snap_context *snapc;	/* snapshot ids + sequence number */
	u64 snap_names_len;	/* total size in bytes of the snap_names buffer */
	u32 total_snaps;	/* entries in snapc->snaps[] and snap_sizes[] */

	char *snap_names;	/* concatenated NUL-terminated snapshot names */
	u64 *snap_sizes;	/* per-snapshot image size, parallel to snapc->snaps[] */

	u64 obj_version;	/* header object version — presumably updated on
				 * refresh; see rbd_refresh_header() */
};
92
struct rbd_options {
	int notify_timeout;	/* parsed from "notify_timeout=%d"; defaults to
				 * RBD_NOTIFY_TIMEOUT_DEFAULT in rbd_get_client() */
};
96
97/*
Alex Elderf0f8cef2012-01-29 13:57:44 -060098 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -070099 */
struct rbd_client {
	struct ceph_client *client;	/* cluster connection; owns the ceph_options */
	struct rbd_options *rbd_opts;	/* rbd-specific options, freed in rbd_client_release() */
	struct kref kref;		/* dropped via rbd_put_client() */
	struct list_head node;		/* entry in rbd_client_list (rbd_client_list_lock) */
};
106
107/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600108 * a request completion status
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700109 */
struct rbd_req_status {
	int done;	/* nonzero once this request has completed */
	int rc;		/* completion status handed to __blk_end_request() */
	u64 bytes;	/* number of bytes completed */
};
115
116/*
117 * a collection of requests
118 */
119struct rbd_req_coll {
120 int total;
121 int num_done;
122 struct kref kref;
123 struct rbd_req_status status[0];
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700124};
125
Alex Elderf0f8cef2012-01-29 13:57:44 -0600126/*
127 * a single io request
128 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;			/* byte count covered by this request */
	int coll_index;			/* this request's slot in coll->status[] */
	struct rbd_req_coll *coll;	/* collection this request belongs to */
};
137
/* A single snapshot, represented in sysfs as a child of its rbd device. */
struct rbd_snap {
	struct device dev;	/* sysfs representation */
	const char *name;	/* snapshot name */
	u64 size;		/* image size at the time of the snapshot */
	struct list_head node;	/* entry in rbd_device->snaps */
	u64 id;			/* snapshot id */
};
145
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700146/*
147 * a single device
148 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;		/* blkdev's gendisk and rq */
	struct request_queue *q;

	struct rbd_client *rbd_client;	/* possibly shared client; see rbd_get_client() */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;		/* queue lock */

	struct rbd_image_header header;	/* in-memory copy of the image header */
	char *image_name;	/* name of the mapped rbd image */
	size_t image_name_len;
	char *header_name;	/* name of the image's header object —
				 * NOTE(review): naming scheme set elsewhere */
	char *pool_name;	/* pool the image lives in */
	int pool_id;

	struct ceph_osd_event *watch_event;	/* watch/notify state for header updates */
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore header_rwsem;
	/* name of the snapshot this device reads from */
	char *snap_name;
	/* id of the snapshot this device reads from */
	u64 snap_id;		/* current snapshot id */
	/* whether the snap_id this device reads from still exists */
	bool snap_exists;
	int read_only;		/* nonzero when mapping a snapshot; see rbd_header_set_snap() */

	struct list_head node;	/* entry in rbd_dev_list (rbd_dev_list_lock) */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
};
190
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700191static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a82f2012-01-29 13:57:44 -0600192
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700193static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a82f2012-01-29 13:57:44 -0600194static DEFINE_SPINLOCK(rbd_dev_list_lock);
195
Alex Elder432b8582012-01-29 13:57:44 -0600196static LIST_HEAD(rbd_client_list); /* clients */
197static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700198
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800199static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
200static void rbd_dev_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800201static ssize_t rbd_snap_add(struct device *dev,
202 struct device_attribute *attr,
203 const char *buf,
204 size_t count);
Alex Elder14e70852012-07-19 09:09:27 -0500205static void __rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800206
Alex Elderf0f8cef2012-01-29 13:57:44 -0600207static ssize_t rbd_add(struct bus_type *bus, const char *buf,
208 size_t count);
209static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
210 size_t count);
211
/* Write-only bus attributes: the "add" and "remove" control files. */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

/* Pseudo-bus that all rbd devices are registered on. */
static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};
222
/* No-op release: rbd_root_dev is statically allocated, nothing to free. */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Root device — presumably the common parent of all rbd devices. */
static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};
231
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800232
/* Take a reference on the rbd device's embedded struct device. */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}

/* Drop a reference taken with rbd_get_dev(). */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700242
Alex Elder1fe5e992012-07-25 09:32:41 -0500243static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700244
/*
 * Block device open: refuse writable opens of read-only mappings,
 * otherwise pin the device and propagate its ro flag to the bdev.
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	/* Snapshot mappings are read-only (see rbd_header_set_snap()) */
	if ((mode & FMODE_WRITE) && rbd_dev->read_only)
		return -EROFS;

	rbd_get_dev(rbd_dev);	/* dropped in rbd_release() */
	set_device_ro(bdev, rbd_dev->read_only);

	return 0;
}
257
/* Block device release: drop the reference taken in rbd_open(). */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	rbd_put_dev(rbd_dev);

	return 0;
}
266
/* Block device operations; rbd has no ioctl/media-change handling. */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
272
273/*
274 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500275 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700276 */
Alex Elder43ae4702012-07-03 16:01:18 -0500277static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700278 struct rbd_options *rbd_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700279{
280 struct rbd_client *rbdc;
281 int ret = -ENOMEM;
282
283 dout("rbd_client_create\n");
284 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
285 if (!rbdc)
286 goto out_opt;
287
288 kref_init(&rbdc->kref);
289 INIT_LIST_HEAD(&rbdc->node);
290
Alex Elderbc534d82012-01-29 13:57:44 -0600291 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
292
Alex Elder43ae4702012-07-03 16:01:18 -0500293 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700294 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600295 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500296 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700297
298 ret = ceph_open_session(rbdc->client);
299 if (ret < 0)
300 goto out_err;
301
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700302 rbdc->rbd_opts = rbd_opts;
303
Alex Elder432b8582012-01-29 13:57:44 -0600304 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700305 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600306 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700307
Alex Elderbc534d82012-01-29 13:57:44 -0600308 mutex_unlock(&ctl_mutex);
309
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700310 dout("rbd_client_create created %p\n", rbdc);
311 return rbdc;
312
313out_err:
314 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600315out_mutex:
316 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700317 kfree(rbdc);
318out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500319 if (ceph_opts)
320 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400321 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700322}
323
324/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700325 * Find a ceph client with specific addr and configuration. If
326 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700327 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700328static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700329{
330 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700331 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700332
Alex Elder43ae4702012-07-03 16:01:18 -0500333 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700334 return NULL;
335
Alex Elder1f7ba332012-08-10 13:12:07 -0700336 spin_lock(&rbd_client_list_lock);
337 list_for_each_entry(client_node, &rbd_client_list, node) {
338 if (!ceph_compare_options(ceph_opts, client_node->client)) {
339 kref_get(&client_node->kref);
340 found = true;
341 break;
342 }
343 }
344 spin_unlock(&rbd_client_list_lock);
345
346 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700347}
348
349/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700350 * mount options
351 */
/*
 * Option tokens recognized by parse_rbd_opts_token().  Tokens below
 * Opt_last_int take an integer argument; tokens between Opt_last_int
 * and Opt_last_string take a string argument.
 */
enum {
	Opt_notify_timeout,
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
};

static match_table_t rbd_opts_tokens = {
	{Opt_notify_timeout, "notify_timeout=%d"},
	/* int args above */
	/* string args above */
	{-1, NULL}
};
366
/*
 * Parse one option token into the rbd_options passed as @private.
 * Registered as the extra-option callback of ceph_parse_options()
 * in rbd_get_client().  Returns 0 on success, negative errno on a
 * malformed or unknown token.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* Tokens below Opt_last_int carry an integer argument */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_notify_timeout:
		rbd_opts->notify_timeout = intval;
		break;
	default:
		/* Any other recognized token here is a programming error */
		BUG_ON(token);
	}
	return 0;
}
401
402/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700403 * Get a ceph client with specific addr and configuration, if one does
404 * not exist create it.
405 */
static struct rbd_client *rbd_get_client(const char *mon_addr,
					 size_t mon_addr_len,
					 char *options)
{
	struct rbd_client *rbdc;
	struct ceph_options *ceph_opts;
	struct rbd_options *rbd_opts;

	rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		return ERR_PTR(-ENOMEM);

	rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;

	/* libceph handles its own options; rbd's go to parse_rbd_opts_token() */
	ceph_opts = ceph_parse_options(options, mon_addr,
				       mon_addr + mon_addr_len,
				       parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(ceph_opts)) {
		kfree(rbd_opts);
		return ERR_CAST(ceph_opts);
	}

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		/* using an existing client */
		ceph_destroy_options(ceph_opts);
		kfree(rbd_opts);

		return rbdc;
	}

	/* rbd_client_create() consumes ceph_opts; rbd_opts only on success */
	rbdc = rbd_client_create(ceph_opts, rbd_opts);
	if (IS_ERR(rbdc))
		kfree(rbd_opts);

	return rbdc;
}
443
/*
 * Destroy ceph client.  kref release callback, reached via
 * rbd_put_client().  Takes rbd_client_list_lock itself to unlink
 * the client from rbd_client_list (callers must NOT hold it).
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc->rbd_opts);
	kfree(rbdc);
}
462
463/*
464 * Drop reference to ceph client node. If it's not referenced anymore, release
465 * it.
466 */
static void rbd_put_client(struct rbd_device *rbd_dev)
{
	/* May free the client via rbd_client_release() */
	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
	rbd_dev->rbd_client = NULL;	/* guard against stale use */
}
472
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700473/*
474 * Destroy requests collection
475 */
/* kref release callback for a request collection. */
static void rbd_coll_release(struct kref *kref)
{
	struct rbd_req_coll *coll =
		container_of(kref, struct rbd_req_coll, kref);

	dout("rbd_coll_release %p\n", coll);
	kfree(coll);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700484
/*
 * Sanity-check an on-disk image header before rbd_header_from_disk()
 * converts it: verify the magic text and make sure the snapshot
 * metadata sizes cannot overflow a size_t when allocated in-memory.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
513
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700514/*
515 * Create a new header structure, translate header format from the on-disk
516 * header.
517 */
static int rbd_header_from_disk(struct rbd_image_header *header,
				struct rbd_image_header_ondisk *ondisk)
{
	u32 snap_count;
	size_t len;
	size_t size;

	/* Start from a clean slate so the error path can free blindly */
	memset(header, 0, sizeof (*header));

	snap_count = le32_to_cpu(ondisk->snap_count);

	/* Copy the object prefix, adding the NUL the on-disk field may lack */
	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
	if (!header->object_prefix)
		return -ENOMEM;
	memcpy(header->object_prefix, ondisk->object_prefix, len);
	header->object_prefix[len] = '\0';

	if (snap_count) {
		header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
		/* rbd_dev_ondisk_valid() guaranteed this fits in a size_t */
		BUG_ON(header->snap_names_len > (u64) SIZE_MAX);
		header->snap_names = kmalloc(header->snap_names_len,
					     GFP_KERNEL);
		if (!header->snap_names)
			goto out_err;

		size = snap_count * sizeof (*header->snap_sizes);
		header->snap_sizes = kmalloc(size, GFP_KERNEL);
		if (!header->snap_sizes)
			goto out_err;
	} else {
		/* No snapshots: the names buffer must be empty too */
		WARN_ON(ondisk->snap_names_len);
		header->snap_names_len = 0;
		header->snap_names = NULL;
		header->snap_sizes = NULL;
	}

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->obj_order = ondisk->options.order;
	header->crypt_type = ondisk->options.crypt_type;
	header->comp_type = ondisk->options.comp_type;
	header->total_snaps = snap_count;

	/* Snapshot context: fixed part plus one snap id per snapshot */
	size = sizeof (struct ceph_snap_context);
	size += snap_count * sizeof (header->snapc->snaps[0]);
	header->snapc = kzalloc(size, GFP_KERNEL);
	if (!header->snapc)
		goto out_err;

	atomic_set(&header->snapc->nref, 1);
	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
	header->snapc->num_snaps = snap_count;

	/* Fill in the snapshot information */

	if (snap_count) {
		u32 i;

		for (i = 0; i < snap_count; i++) {
			header->snapc->snaps[i] =
				le64_to_cpu(ondisk->snaps[i].id);
			header->snap_sizes[i] =
				le64_to_cpu(ondisk->snaps[i].image_size);
		}

		/* copy snapshot names, packed on disk after the snaps array */
		memcpy(header->snap_names, &ondisk->snaps[snap_count],
			header->snap_names_len);
	}

	return 0;

out_err:
	/* Free whatever was allocated; NULL pointers so rbd_header_free()
	 * on the same header stays safe */
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	header->snap_names_len = 0;
	kfree(header->object_prefix);
	header->object_prefix = NULL;

	return -ENOMEM;
}
601
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700602static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
603 u64 *seq, u64 *size)
604{
605 int i;
606 char *p = header->snap_names;
607
Alex Elder00f1f362012-02-07 12:03:36 -0600608 for (i = 0; i < header->total_snaps; i++) {
609 if (!strcmp(snap_name, p)) {
610
611 /* Found it. Pass back its id and/or size */
612
613 if (seq)
614 *seq = header->snapc->snaps[i];
615 if (size)
616 *size = header->snap_sizes[i];
617 return i;
618 }
619 p += strlen(p) + 1; /* Skip ahead to the next name */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700620 }
Alex Elder00f1f362012-02-07 12:03:36 -0600621 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700622}
623
/*
 * Configure the device's snapshot state from rbd_dev->snap_name:
 * mapping the head ("-") gives a writable device at CEPH_NOSNAP,
 * mapping a named snapshot gives a read-only device.  Optionally
 * reports the mapped image size via @size.  Returns 0 or a negative
 * errno (-ENOENT if the snapshot name is unknown).
 */
static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
{
	int ret;

	down_write(&rbd_dev->header_rwsem);

	if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		/* Mapping the base image, not a snapshot */
		rbd_dev->snap_id = CEPH_NOSNAP;
		rbd_dev->snap_exists = false;
		rbd_dev->read_only = 0;
		if (size)
			*size = rbd_dev->header.image_size;
	} else {
		u64 snap_id = 0;

		ret = snap_by_name(&rbd_dev->header, rbd_dev->snap_name,
				   &snap_id, size);
		if (ret < 0)
			goto done;
		rbd_dev->snap_id = snap_id;
		rbd_dev->snap_exists = true;
		rbd_dev->read_only = 1;	/* snapshots are never writable */
	}

	ret = 0;
done:
	up_write(&rbd_dev->header_rwsem);
	return ret;
}
654
/*
 * Release everything rbd_header_from_disk() allocated.  Pointers are
 * NULLed so a second call is harmless.
 */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	header->snap_names_len = 0;
	ceph_put_snap_context(header->snapc);
	header->snapc = NULL;
}
667
668/*
669 * get the actual striped segment name, offset and length
670 */
671static u64 rbd_get_segment(struct rbd_image_header *header,
Alex Elderca1e49a2012-07-10 20:30:09 -0500672 const char *object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700673 u64 ofs, u64 len,
674 char *seg_name, u64 *segofs)
675{
676 u64 seg = ofs >> header->obj_order;
677
678 if (seg_name)
679 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
Alex Elderca1e49a2012-07-10 20:30:09 -0500680 "%s.%012llx", object_prefix, seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700681
682 ofs = ofs & ((1 << header->obj_order) - 1);
683 len = min_t(u64, len, (1 << header->obj_order) - ofs);
684
685 if (segofs)
686 *segofs = ofs;
687
688 return len;
689}
690
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700691static int rbd_get_num_segments(struct rbd_image_header *header,
692 u64 ofs, u64 len)
693{
694 u64 start_seg = ofs >> header->obj_order;
695 u64 end_seg = (ofs + len - 1) >> header->obj_order;
696 return end_seg - start_seg + 1;
697}
698
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700699/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700700 * returns the size of an object in the image
701 */
702static u64 rbd_obj_bytes(struct rbd_image_header *header)
703{
704 return 1 << header->obj_order;
705}
706
707/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700708 * bio helpers
709 */
710
711static void bio_chain_put(struct bio *chain)
712{
713 struct bio *tmp;
714
715 while (chain) {
716 tmp = chain;
717 chain = chain->bi_next;
718 bio_put(tmp);
719 }
720}
721
722/*
723 * zeros a bio chain, starting at specific offset
724 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* byte offset from the start of the chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* Zero only the tail of this segment that
				 * lies at or beyond start_ofs */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
748
749/*
750 * bio_chain_clone - clone a chain of bios up to a certain length.
751 * might return a bio_pair that will need to be released.
752 */
static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
				   struct bio_pair **bp,
				   int len, gfp_t gfpmask)
{
	struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
	int total = 0;

	/* Release any split left over from a previous call */
	if (*bp) {
		bio_pair_release(*bp);
		*bp = NULL;
	}

	while (old_chain && (total < len)) {
		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
		if (!tmp)
			goto err_out;

		if (total + old_chain->bi_size > len) {
			struct bio_pair *bp;

			/*
			 * this split can only happen with a single paged bio,
			 * split_bio will BUG_ON if this is not the case
			 */
			dout("bio_chain_clone split! total=%d remaining=%d"
			     "bi_size=%u\n",
			     total, len - total, old_chain->bi_size);

			/* split the bio. We'll release it either in the next
			   call, or it will have to be released outside */
			bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
			if (!bp)
				goto err_out;

			__bio_clone(tmp, &bp->bio1);

			/* remainder of the split bio is handed back */
			*next = &bp->bio2;
		} else {
			__bio_clone(tmp, old_chain);
			*next = old_chain->bi_next;
		}

		tmp->bi_bdev = NULL;
		/* only the first allocation in the chain may sleep */
		gfpmask &= ~__GFP_WAIT;
		tmp->bi_next = NULL;

		/* append the clone to the new chain */
		if (!new_chain) {
			new_chain = tail = tmp;
		} else {
			tail->bi_next = tmp;
			tail = tmp;
		}
		old_chain = old_chain->bi_next;

		total += tmp->bi_size;
	}

	BUG_ON(total < len);

	if (tail)
		tail->bi_next = NULL;

	/* hand back the uncloned remainder of the original chain */
	*old = old_chain;

	return new_chain;

err_out:
	dout("bio_chain_clone with err\n");
	bio_chain_put(new_chain);
	return NULL;
}
824
825/*
826 * helpers for osd request op vectors.
827 */
Alex Elder57cfc102012-06-26 12:57:03 -0700828static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
829 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700830{
Alex Elder57cfc102012-06-26 12:57:03 -0700831 struct ceph_osd_req_op *ops;
832
833 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
834 if (!ops)
835 return NULL;
836
837 ops[0].op = opcode;
838
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700839 /*
840 * op extent offset and length will be set later on
841 * in calc_raw_layout()
842 */
Alex Elder57cfc102012-06-26 12:57:03 -0700843 ops[0].payload_len = payload_len;
844
845 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700846}
847
/* Free an op vector allocated by rbd_create_rw_ops() (NULL is a no-op). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
852
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700853static void rbd_coll_end_req_index(struct request *rq,
854 struct rbd_req_coll *coll,
855 int index,
856 int ret, u64 len)
857{
858 struct request_queue *q;
859 int min, max, i;
860
Alex Elderbd919d42012-07-13 20:35:11 -0500861 dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
862 coll, index, ret, (unsigned long long) len);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700863
864 if (!rq)
865 return;
866
867 if (!coll) {
868 blk_end_request(rq, ret, len);
869 return;
870 }
871
872 q = rq->q;
873
874 spin_lock_irq(q->queue_lock);
875 coll->status[index].done = 1;
876 coll->status[index].rc = ret;
877 coll->status[index].bytes = len;
878 max = min = coll->num_done;
879 while (max < coll->total && coll->status[max].done)
880 max++;
881
882 for (i = min; i<max; i++) {
883 __blk_end_request(rq, coll->status[i].rc,
884 coll->status[i].bytes);
885 coll->num_done++;
886 kref_put(&coll->kref, rbd_coll_release);
887 }
888 spin_unlock_irq(q->queue_lock);
889}
890
891static void rbd_coll_end_req(struct rbd_request *req,
892 int ret, u64 len)
893{
894 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
895}
896
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700897/*
898 * Send ceph osd request
899 */
900static int rbd_do_request(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -0500901 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700902 struct ceph_snap_context *snapc,
903 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -0500904 const char *object_name, u64 ofs, u64 len,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700905 struct bio *bio,
906 struct page **pages,
907 int num_pages,
908 int flags,
909 struct ceph_osd_req_op *ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700910 struct rbd_req_coll *coll,
911 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700912 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700913 struct ceph_msg *msg),
914 struct ceph_osd_request **linger_req,
915 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700916{
917 struct ceph_osd_request *req;
918 struct ceph_file_layout *layout;
919 int ret;
920 u64 bno;
921 struct timespec mtime = CURRENT_TIME;
922 struct rbd_request *req_data;
923 struct ceph_osd_request_head *reqhead;
Alex Elder1dbb4392012-01-24 10:08:37 -0600924 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700925
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700926 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700927 if (!req_data) {
928 if (coll)
929 rbd_coll_end_req_index(rq, coll, coll_index,
930 -ENOMEM, len);
931 return -ENOMEM;
932 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700933
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700934 if (coll) {
935 req_data->coll = coll;
936 req_data->coll_index = coll_index;
937 }
938
Alex Elderbd919d42012-07-13 20:35:11 -0500939 dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
940 (unsigned long long) ofs, (unsigned long long) len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700941
Alex Elder0ce1a792012-07-03 16:01:18 -0500942 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder1dbb4392012-01-24 10:08:37 -0600943 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
944 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -0700945 if (!req) {
Sage Weil4ad12622011-05-03 09:23:36 -0700946 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700947 goto done_pages;
948 }
949
950 req->r_callback = rbd_cb;
951
952 req_data->rq = rq;
953 req_data->bio = bio;
954 req_data->pages = pages;
955 req_data->len = len;
956
957 req->r_priv = req_data;
958
959 reqhead = req->r_request->front.iov_base;
960 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
961
Alex Elderaded07e2012-07-03 16:01:18 -0500962 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700963 req->r_oid_len = strlen(req->r_oid);
964
965 layout = &req->r_file_layout;
966 memset(layout, 0, sizeof(*layout));
967 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
968 layout->fl_stripe_count = cpu_to_le32(1);
969 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
Alex Elder0ce1a792012-07-03 16:01:18 -0500970 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
Alex Elder1dbb4392012-01-24 10:08:37 -0600971 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
972 req, ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700973
974 ceph_osdc_build_request(req, ofs, &len,
975 ops,
976 snapc,
977 &mtime,
978 req->r_oid, req->r_oid_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700979
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700980 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600981 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700982 *linger_req = req;
983 }
984
Alex Elder1dbb4392012-01-24 10:08:37 -0600985 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700986 if (ret < 0)
987 goto done_err;
988
989 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600990 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700991 if (ver)
992 *ver = le64_to_cpu(req->r_reassert_version.version);
Alex Elderbd919d42012-07-13 20:35:11 -0500993 dout("reassert_ver=%llu\n",
994 (unsigned long long)
995 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700996 ceph_osdc_put_request(req);
997 }
998 return ret;
999
1000done_err:
1001 bio_chain_put(req_data->bio);
1002 ceph_osdc_put_request(req);
1003done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001004 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001005 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001006 return ret;
1007}
1008
1009/*
1010 * Ceph osd op callback
1011 */
/*
 * Ceph osd op callback for async rbd I/O: decode the reply, patch up
 * short/missing reads by zero-filling the bio chain, then complete
 * the request's collection slot and release the request resources.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);	/* first op follows the header */
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
	    (unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* reading a nonexistent object: treat as all zeroes */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero the tail and report the full length */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1048
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001049static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1050{
1051 ceph_osdc_put_request(req);
1052}
1053
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001054/*
1055 * Do a synchronous ceph osd operation
1056 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001057static int rbd_req_sync_op(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001058 struct ceph_snap_context *snapc,
1059 u64 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001060 int flags,
Alex Elder913d2fd2012-06-26 12:57:03 -07001061 struct ceph_osd_req_op *ops,
Alex Elderaded07e2012-07-03 16:01:18 -05001062 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001063 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001064 char *buf,
1065 struct ceph_osd_request **linger_req,
1066 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001067{
1068 int ret;
1069 struct page **pages;
1070 int num_pages;
Alex Elder913d2fd2012-06-26 12:57:03 -07001071
1072 BUG_ON(ops == NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001073
1074 num_pages = calc_pages_for(ofs , len);
1075 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
Dan Carpenterb8d06382010-10-11 21:14:23 +02001076 if (IS_ERR(pages))
1077 return PTR_ERR(pages);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001078
Alex Elder0ce1a792012-07-03 16:01:18 -05001079 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001080 object_name, ofs, len, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001081 pages, num_pages,
1082 flags,
1083 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001084 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001085 NULL,
1086 linger_req, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001087 if (ret < 0)
Alex Elder913d2fd2012-06-26 12:57:03 -07001088 goto done;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001089
1090 if ((flags & CEPH_OSD_FLAG_READ) && buf)
1091 ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
1092
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001093done:
1094 ceph_release_page_vector(pages, num_pages);
1095 return ret;
1096}
1097
1098/*
1099 * Do an asynchronous ceph osd operation
1100 */
/*
 * Submit one asynchronous osd request for the image extent
 * [ofs, ofs+len).  The extent is mapped to its rbd object segment;
 * the bios were already split on segment boundaries upstream, so the
 * extent must fit entirely inside one segment (BUG_ON enforced).
 * Completion is reported through slot @coll_index of @coll by
 * rbd_req_cb().
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 snapid,
		     int opcode, int flags,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;

	seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
	if (!seg_name)
		return -ENOMEM;

	/* map (ofs, len) to a segment name, offset and in-segment length */
	seg_len = rbd_get_segment(&rbd_dev->header,
				  rbd_dev->header.object_prefix,
				  ofs, len,
				  seg_name, &seg_ofs);

	/* only writes carry outbound payload */
	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	BUG_ON(seg_len < len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1153
1154/*
1155 * Request async osd write
1156 */
1157static int rbd_req_write(struct request *rq,
1158 struct rbd_device *rbd_dev,
1159 struct ceph_snap_context *snapc,
1160 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001161 struct bio *bio,
1162 struct rbd_req_coll *coll,
1163 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001164{
1165 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1166 CEPH_OSD_OP_WRITE,
1167 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001168 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001169}
1170
1171/*
1172 * Request async osd read
1173 */
1174static int rbd_req_read(struct request *rq,
1175 struct rbd_device *rbd_dev,
1176 u64 snapid,
1177 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001178 struct bio *bio,
1179 struct rbd_req_coll *coll,
1180 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001181{
1182 return rbd_do_op(rq, rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001183 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001184 CEPH_OSD_OP_READ,
1185 CEPH_OSD_FLAG_READ,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001186 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001187}
1188
1189/*
1190 * Request sync osd read
1191 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001192static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001193 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001194 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001195 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001196 char *buf,
1197 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001198{
Alex Elder913d2fd2012-06-26 12:57:03 -07001199 struct ceph_osd_req_op *ops;
1200 int ret;
1201
1202 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1203 if (!ops)
1204 return -ENOMEM;
1205
1206 ret = rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001207 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001208 CEPH_OSD_FLAG_READ,
Alex Elder913d2fd2012-06-26 12:57:03 -07001209 ops, object_name, ofs, len, buf, NULL, ver);
1210 rbd_destroy_ops(ops);
1211
1212 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001213}
1214
1215/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001216 * Request sync osd watch
1217 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001218static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001219 u64 ver,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001220 u64 notify_id)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001221{
1222 struct ceph_osd_req_op *ops;
Sage Weil11f77002011-05-12 16:13:54 -07001223 int ret;
1224
Alex Elder57cfc102012-06-26 12:57:03 -07001225 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1226 if (!ops)
1227 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001228
Josh Durgina71b8912011-12-05 18:10:44 -08001229 ops[0].watch.ver = cpu_to_le64(ver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001230 ops[0].watch.cookie = notify_id;
1231 ops[0].watch.flag = 0;
1232
Alex Elder0ce1a792012-07-03 16:01:18 -05001233 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001234 rbd_dev->header_name, 0, 0, NULL,
Alex Elderad4f2322012-07-03 16:01:19 -05001235 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001236 CEPH_OSD_FLAG_READ,
1237 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001238 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001239 rbd_simple_req_cb, 0, NULL);
1240
1241 rbd_destroy_ops(ops);
1242 return ret;
1243}
1244
1245static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1246{
Alex Elder0ce1a792012-07-03 16:01:18 -05001247 struct rbd_device *rbd_dev = (struct rbd_device *)data;
Josh Durgina71b8912011-12-05 18:10:44 -08001248 u64 hver;
Sage Weil13143d22011-05-12 16:08:30 -07001249 int rc;
1250
Alex Elder0ce1a792012-07-03 16:01:18 -05001251 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001252 return;
1253
Alex Elderbd919d42012-07-13 20:35:11 -05001254 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1255 rbd_dev->header_name, (unsigned long long) notify_id,
1256 (unsigned int) opcode);
Alex Elder1fe5e992012-07-25 09:32:41 -05001257 rc = rbd_refresh_header(rbd_dev, &hver);
Sage Weil13143d22011-05-12 16:08:30 -07001258 if (rc)
Alex Elderf0f8cef2012-01-29 13:57:44 -06001259 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
Alex Elder0ce1a792012-07-03 16:01:18 -05001260 " update snaps: %d\n", rbd_dev->major, rc);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001261
Alex Elder7f0a24d2012-07-25 09:32:40 -05001262 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001263}
1264
1265/*
1266 * Request sync osd watch
1267 */
Alex Elder0e6f3222012-07-25 09:32:40 -05001268static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001269{
1270 struct ceph_osd_req_op *ops;
Alex Elder0ce1a792012-07-03 16:01:18 -05001271 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder57cfc102012-06-26 12:57:03 -07001272 int ret;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001273
Alex Elder57cfc102012-06-26 12:57:03 -07001274 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1275 if (!ops)
1276 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001277
1278 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
Alex Elder0ce1a792012-07-03 16:01:18 -05001279 (void *)rbd_dev, &rbd_dev->watch_event);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001280 if (ret < 0)
1281 goto fail;
1282
Alex Elder0e6f3222012-07-25 09:32:40 -05001283 ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
Alex Elder0ce1a792012-07-03 16:01:18 -05001284 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001285 ops[0].watch.flag = 1;
1286
Alex Elder0ce1a792012-07-03 16:01:18 -05001287 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001288 CEPH_NOSNAP,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001289 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1290 ops,
Alex Elder0e6f3222012-07-25 09:32:40 -05001291 rbd_dev->header_name,
1292 0, 0, NULL,
Alex Elder0ce1a792012-07-03 16:01:18 -05001293 &rbd_dev->watch_request, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001294
1295 if (ret < 0)
1296 goto fail_event;
1297
1298 rbd_destroy_ops(ops);
1299 return 0;
1300
1301fail_event:
Alex Elder0ce1a792012-07-03 16:01:18 -05001302 ceph_osdc_cancel_event(rbd_dev->watch_event);
1303 rbd_dev->watch_event = NULL;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001304fail:
1305 rbd_destroy_ops(ops);
1306 return ret;
1307}
1308
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001309/*
1310 * Request sync osd unwatch
1311 */
Alex Elder070c6332012-07-25 09:32:41 -05001312static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001313{
1314 struct ceph_osd_req_op *ops;
Alex Elder57cfc102012-06-26 12:57:03 -07001315 int ret;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001316
Alex Elder57cfc102012-06-26 12:57:03 -07001317 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1318 if (!ops)
1319 return -ENOMEM;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001320
1321 ops[0].watch.ver = 0;
Alex Elder0ce1a792012-07-03 16:01:18 -05001322 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001323 ops[0].watch.flag = 0;
1324
Alex Elder0ce1a792012-07-03 16:01:18 -05001325 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001326 CEPH_NOSNAP,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001327 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1328 ops,
Alex Elder070c6332012-07-25 09:32:41 -05001329 rbd_dev->header_name,
1330 0, 0, NULL, NULL, NULL);
1331
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001332
1333 rbd_destroy_ops(ops);
Alex Elder0ce1a792012-07-03 16:01:18 -05001334 ceph_osdc_cancel_event(rbd_dev->watch_event);
1335 rbd_dev->watch_event = NULL;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001336 return ret;
1337}
1338
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001339struct rbd_notify_info {
Alex Elder0ce1a792012-07-03 16:01:18 -05001340 struct rbd_device *rbd_dev;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001341};
1342
1343static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1344{
Alex Elder0ce1a792012-07-03 16:01:18 -05001345 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1346 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001347 return;
1348
Alex Elderbd919d42012-07-13 20:35:11 -05001349 dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n",
1350 rbd_dev->header_name, (unsigned long long) notify_id,
1351 (unsigned int) opcode);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001352}
1353
1354/*
1355 * Request sync osd notify
1356 */
Alex Elder4cb16252012-07-25 09:32:40 -05001357static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001358{
1359 struct ceph_osd_req_op *ops;
Alex Elder0ce1a792012-07-03 16:01:18 -05001360 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001361 struct ceph_osd_event *event;
1362 struct rbd_notify_info info;
1363 int payload_len = sizeof(u32) + sizeof(u32);
1364 int ret;
1365
Alex Elder57cfc102012-06-26 12:57:03 -07001366 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len);
1367 if (!ops)
1368 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001369
Alex Elder0ce1a792012-07-03 16:01:18 -05001370 info.rbd_dev = rbd_dev;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001371
1372 ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
1373 (void *)&info, &event);
1374 if (ret < 0)
1375 goto fail;
1376
1377 ops[0].watch.ver = 1;
1378 ops[0].watch.flag = 1;
1379 ops[0].watch.cookie = event->cookie;
1380 ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
1381 ops[0].watch.timeout = 12;
1382
Alex Elder0ce1a792012-07-03 16:01:18 -05001383 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001384 CEPH_NOSNAP,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001385 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1386 ops,
Alex Elder4cb16252012-07-25 09:32:40 -05001387 rbd_dev->header_name,
1388 0, 0, NULL, NULL, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001389 if (ret < 0)
1390 goto fail_event;
1391
1392 ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
1393 dout("ceph_osdc_wait_event returned %d\n", ret);
1394 rbd_destroy_ops(ops);
1395 return 0;
1396
1397fail_event:
1398 ceph_osdc_cancel_event(event);
1399fail:
1400 rbd_destroy_ops(ops);
1401 return ret;
1402}
1403
1404/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001405 * Request sync osd read
1406 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001407static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001408 const char *object_name,
1409 const char *class_name,
1410 const char *method_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001411 const char *data,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001412 int len,
1413 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001414{
1415 struct ceph_osd_req_op *ops;
Alex Elderaded07e2012-07-03 16:01:18 -05001416 int class_name_len = strlen(class_name);
1417 int method_name_len = strlen(method_name);
Alex Elder57cfc102012-06-26 12:57:03 -07001418 int ret;
1419
1420 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL,
Alex Elderaded07e2012-07-03 16:01:18 -05001421 class_name_len + method_name_len + len);
Alex Elder57cfc102012-06-26 12:57:03 -07001422 if (!ops)
1423 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001424
Alex Elderaded07e2012-07-03 16:01:18 -05001425 ops[0].cls.class_name = class_name;
1426 ops[0].cls.class_len = (__u8) class_name_len;
1427 ops[0].cls.method_name = method_name;
1428 ops[0].cls.method_len = (__u8) method_name_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001429 ops[0].cls.argc = 0;
1430 ops[0].cls.indata = data;
1431 ops[0].cls.indata_len = len;
1432
Alex Elder0ce1a792012-07-03 16:01:18 -05001433 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001434 CEPH_NOSNAP,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001435 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1436 ops,
Alex Elderd1f57ea2012-06-26 12:57:03 -07001437 object_name, 0, 0, NULL, NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001438
1439 rbd_destroy_ops(ops);
1440
1441 dout("cls_exec returned %d\n", ret);
1442 return ret;
1443}
1444
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001445static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1446{
1447 struct rbd_req_coll *coll =
1448 kzalloc(sizeof(struct rbd_req_coll) +
1449 sizeof(struct rbd_req_status) * num_reqs,
1450 GFP_ATOMIC);
1451
1452 if (!coll)
1453 return NULL;
1454 coll->total = num_reqs;
1455 kref_init(&coll->kref);
1456 return coll;
1457}
1458
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001459/*
1460 * block device queue callback
1461 */
/*
 * Block-layer request function: drain the queue, splitting each
 * request into per-object-segment osd operations tracked by a
 * request collection.  Entered with q->queue_lock held (it is
 * dropped around the submission work and re-taken before completing
 * a request or fetching the next one).
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;
	struct bio_pair *bp = NULL;

	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		struct bio *rq_bio, *next_bio = NULL;
		bool do_write;
		unsigned int size;
		u64 op_size = 0;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;

		/* peek at request from block layer */
		if (!rq)
			break;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		rq_bio = rq->bio;
		if (do_write && rbd_dev->read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		spin_unlock_irq(q->queue_lock);

		/* header must stay stable while we sample snap state */
		down_read(&rbd_dev->header_rwsem);

		if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) {
			up_read(&rbd_dev->header_rwsem);
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		/* hold a snap context ref for the life of this request */
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		/* one iteration per object segment touched by the request */
		do {
			/* a bio clone to be passed down to OSD req */
			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
			op_size = rbd_get_segment(&rbd_dev->header,
						  rbd_dev->header.object_prefix,
						  ofs, size,
						  NULL, NULL);
			/* one collection ref per in-flight segment op */
			kref_get(&coll->kref);
			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
					      op_size, GFP_ATOMIC);
			if (!bio) {
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, op_size);
				goto next_seg;
			}


			/* init OSD command: write or read */
			if (do_write)
				rbd_req_write(rq, rbd_dev,
					      snapc,
					      ofs,
					      op_size, bio,
					      coll, cur_seg);
			else
				rbd_req_read(rq, rbd_dev,
					     rbd_dev->snap_id,
					     ofs,
					     op_size, bio,
					     coll, cur_seg);

next_seg:
			size -= op_size;
			ofs += op_size;

			cur_seg++;
			rq_bio = next_bio;
		} while (size > 0);
		/* drop the submission-loop reference taken by rbd_alloc_coll */
		kref_put(&coll->kref, rbd_coll_release);

		/* release any bio split left over from the last segment */
		if (bp)
			bio_pair_release(bp);
		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1578
1579/*
1580 * a queue callback. Makes sure that we don't create a bio that spans across
1581 * multiple osd objects. One exception would be with a single page bios,
1582 * which we handle later at bio_chain_clone
1583 */
1584static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1585 struct bio_vec *bvec)
1586{
1587 struct rbd_device *rbd_dev = q->queuedata;
Alex Elder593a9e72012-02-07 12:03:37 -06001588 unsigned int chunk_sectors;
1589 sector_t sector;
1590 unsigned int bio_sectors;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001591 int max;
1592
Alex Elder593a9e72012-02-07 12:03:37 -06001593 chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1594 sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1595 bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1596
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001597 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
Alex Elder593a9e72012-02-07 12:03:37 -06001598 + bio_sectors)) << SECTOR_SHIFT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001599 if (max < 0)
1600 max = 0; /* bio_add cannot handle a negative return */
1601 if (max <= bvec->bv_len && bio_sectors == 0)
1602 return bvec->bv_len;
1603 return max;
1604}
1605
1606static void rbd_free_disk(struct rbd_device *rbd_dev)
1607{
1608 struct gendisk *disk = rbd_dev->disk;
1609
1610 if (!disk)
1611 return;
1612
1613 rbd_header_free(&rbd_dev->header);
1614
1615 if (disk->flags & GENHD_FL_UP)
1616 del_gendisk(disk);
1617 if (disk->queue)
1618 blk_cleanup_queue(disk->queue);
1619 put_disk(disk);
1620}
1621
1622/*
Alex Elder4156d992012-08-02 11:29:46 -05001623 * Read the complete header for the given rbd device.
1624 *
1625 * Returns a pointer to a dynamically-allocated buffer containing
1626 * the complete and validated header. Caller can pass the address
1627 * of a variable that will be filled in with the version of the
1628 * header object at the time it was read.
1629 *
1630 * Returns a pointer-coded errno if a failure occurs.
1631 */
static struct rbd_image_header_ondisk *
rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;	/* snapshots seen on the previous pass */
	u64 names_size = 0;	/* bytes of snapshot names, previous pass */
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		/* Drop the previous (too-small) buffer; kfree(NULL) is OK */
		kfree(ondisk);

		/* Fixed header + snapshot records + packed name strings */
		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return ERR_PTR(-ENOMEM);

		ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
				       rbd_dev->header_name,
				       0, size,
				       (char *) ondisk, version);

		if (ret < 0)
			goto out_err;
		/* A short read means we did not get the whole header */
		if (WARN_ON((size_t) ret < size)) {
			ret = -ENXIO;
			pr_warning("short header read for image %s"
					" (want %zd got %d)\n",
				rbd_dev->image_name, size, ret);
			goto out_err;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			pr_warning("invalid header for image %s\n",
				rbd_dev->image_name);
			goto out_err;
		}

		/* Re-read if the snapshot count changed while we read */
		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	return ondisk;

out_err:
	kfree(ondisk);

	return ERR_PTR(ret);
}
1693
1694/*
 * reload the on-disk header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001696 */
1697static int rbd_read_header(struct rbd_device *rbd_dev,
1698 struct rbd_image_header *header)
1699{
Alex Elder4156d992012-08-02 11:29:46 -05001700 struct rbd_image_header_ondisk *ondisk;
1701 u64 ver = 0;
1702 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001703
Alex Elder4156d992012-08-02 11:29:46 -05001704 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1705 if (IS_ERR(ondisk))
1706 return PTR_ERR(ondisk);
1707 ret = rbd_header_from_disk(header, ondisk);
1708 if (ret >= 0)
1709 header->obj_version = ver;
1710 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001711
Alex Elder4156d992012-08-02 11:29:46 -05001712 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001713}
1714
1715/*
1716 * create a snapshot
1717 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001718static int rbd_header_add_snap(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001719 const char *snap_name,
1720 gfp_t gfp_flags)
1721{
1722 int name_len = strlen(snap_name);
1723 u64 new_snapid;
1724 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001725 void *data, *p, *e;
Alex Elder1dbb4392012-01-24 10:08:37 -06001726 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001727
1728 /* we should create a snapshot only if we're pointing at the head */
Alex Elder0ce1a792012-07-03 16:01:18 -05001729 if (rbd_dev->snap_id != CEPH_NOSNAP)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001730 return -EINVAL;
1731
Alex Elder0ce1a792012-07-03 16:01:18 -05001732 monc = &rbd_dev->rbd_client->client->monc;
1733 ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
Alex Elderbd919d42012-07-13 20:35:11 -05001734 dout("created snapid=%llu\n", (unsigned long long) new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001735 if (ret < 0)
1736 return ret;
1737
1738 data = kmalloc(name_len + 16, gfp_flags);
1739 if (!data)
1740 return -ENOMEM;
1741
Sage Weil916d4d62011-05-12 16:10:50 -07001742 p = data;
1743 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001744
Sage Weil916d4d62011-05-12 16:10:50 -07001745 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1746 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001747
Alex Elder0bed54d2012-07-03 16:01:18 -05001748 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
Alex Elder0ce1a792012-07-03 16:01:18 -05001749 "rbd", "snap_add",
Alex Elderd67d4be2012-07-13 20:35:11 -05001750 data, p - data, NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001751
Sage Weil916d4d62011-05-12 16:10:50 -07001752 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001753
Alex Elder505cbb92012-07-19 08:49:18 -05001754 return ret < 0 ? ret : 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001755bad:
1756 return -ERANGE;
1757}
1758
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001759static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1760{
1761 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001762 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001763
Alex Eldera0593292012-07-19 09:09:27 -05001764 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder14e70852012-07-19 09:09:27 -05001765 __rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001766}
1767
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001768/*
1769 * only read the first part of the ondisk header, without the snaps info
1770 */
/*
 * Re-read the on-disk header and swap its contents into
 * rbd_dev->header under header_rwsem.  The snapshot arrays and snap
 * context are replaced wholesale; the object prefix is expected not
 * to change (see the WARN_ON below).  When mapping the image head,
 * the block device capacity is updated to the new image size.
 * Caller must hold ctl_mutex (see rbd_refresh_header()).
 */
static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	struct rbd_image_header h;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	down_write(&rbd_dev->header_rwsem);

	/* resized? */
	if (rbd_dev->snap_id == CEPH_NOSNAP) {
		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;

		dout("setting size to %llu sectors", (unsigned long long) size);
		set_capacity(rbd_dev->disk, size);
	}

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	if (hver)
		*hver = h.obj_version;
	rbd_dev->header.obj_version = h.obj_version;
	rbd_dev->header.image_size = h.image_size;
	rbd_dev->header.total_snaps = h.total_snaps;
	/* Take ownership of the freshly-read snapshot data */
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_names_len = h.snap_names_len;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	/* Reconcile the snapshot device list with the new snap context */
	ret = __rbd_init_snaps_header(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
1815
Alex Elder1fe5e992012-07-25 09:32:41 -05001816static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
1817{
1818 int ret;
1819
1820 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1821 ret = __rbd_refresh_header(rbd_dev, hver);
1822 mutex_unlock(&ctl_mutex);
1823
1824 return ret;
1825}
1826
/*
 * Set up the Linux block device for an rbd image: read the on-disk
 * header, build the snapshot list, select the mapped snapshot (or
 * head), then allocate and configure the gendisk and request queue
 * and announce the disk.  Returns 0 or a negative errno; on error the
 * partially-constructed disk/queue are released here, while the header
 * read into rbd_dev->header is freed later by rbd_free_disk().
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	int rc;
	u64 segment_size;	/* rbd object size, in bytes */
	u64 total_size = 0;	/* capacity of the mapped snapshot/head */

	/* contact OSD, request size info about the object being mapped */
	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (rc)
		return rc;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = __rbd_init_snaps_header(rbd_dev);
	if (rc)
		return rc;

	rc = rbd_header_set_snap(rbd_dev, &total_size);
	if (rc)
		return rc;

	/* create gendisk info */
	rc = -ENOMEM;
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		goto out;

	/* e.g. "rbd0", "rbd1", ... */
	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	rc = -ENOMEM;
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* keep bios from spanning object boundaries (see rbd_merge_bvec) */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;
	rbd_dev->q = q;

	/* finally, announce the disk to the world */
	set_capacity(disk, total_size / SECTOR_SIZE);
	add_disk(disk);

	pr_info("%s: added with size 0x%llx\n",
		disk->disk_name, (unsigned long long)total_size);
	return 0;

out_disk:
	put_disk(disk);
out:
	return rc;
}
1899
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001900/*
1901 sysfs
1902*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001903
/* Map a sysfs struct device back to the rbd_device embedding it. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1908
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001909static ssize_t rbd_size_show(struct device *dev,
1910 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001911{
Alex Elder593a9e72012-02-07 12:03:37 -06001912 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001913 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001914
Josh Durgina51aa0c2011-12-05 10:35:04 -08001915 down_read(&rbd_dev->header_rwsem);
1916 size = get_capacity(rbd_dev->disk);
1917 up_read(&rbd_dev->header_rwsem);
1918
1919 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001920}
1921
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001922static ssize_t rbd_major_show(struct device *dev,
1923 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001924{
Alex Elder593a9e72012-02-07 12:03:37 -06001925 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001926
1927 return sprintf(buf, "%d\n", rbd_dev->major);
1928}
1929
1930static ssize_t rbd_client_id_show(struct device *dev,
1931 struct device_attribute *attr, char *buf)
1932{
Alex Elder593a9e72012-02-07 12:03:37 -06001933 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001934
Alex Elder1dbb4392012-01-24 10:08:37 -06001935 return sprintf(buf, "client%lld\n",
1936 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001937}
1938
1939static ssize_t rbd_pool_show(struct device *dev,
1940 struct device_attribute *attr, char *buf)
1941{
Alex Elder593a9e72012-02-07 12:03:37 -06001942 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001943
1944 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1945}
1946
Alex Elder9bb2f332012-07-12 10:46:35 -05001947static ssize_t rbd_pool_id_show(struct device *dev,
1948 struct device_attribute *attr, char *buf)
1949{
1950 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1951
1952 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1953}
1954
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001955static ssize_t rbd_name_show(struct device *dev,
1956 struct device_attribute *attr, char *buf)
1957{
Alex Elder593a9e72012-02-07 12:03:37 -06001958 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001959
Alex Elder0bed54d2012-07-03 16:01:18 -05001960 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001961}
1962
1963static ssize_t rbd_snap_show(struct device *dev,
1964 struct device_attribute *attr,
1965 char *buf)
1966{
Alex Elder593a9e72012-02-07 12:03:37 -06001967 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001968
1969 return sprintf(buf, "%s\n", rbd_dev->snap_name);
1970}
1971
1972static ssize_t rbd_image_refresh(struct device *dev,
1973 struct device_attribute *attr,
1974 const char *buf,
1975 size_t size)
1976{
Alex Elder593a9e72012-02-07 12:03:37 -06001977 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05001978 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001979
Alex Elder1fe5e992012-07-25 09:32:41 -05001980 ret = rbd_refresh_header(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05001981
1982 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001983}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001984
/*
 * Per-device sysfs attributes (under /sys/bus/rbd/devices/<id>/).
 * "refresh" and "create_snap" are write-only triggers; the others are
 * read-only state.
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	&dev_attr_create_snap.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/*
 * No-op: the rbd_device's own release callback (assigned in
 * rbd_bus_add_dev()) handles actual teardown.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name = "rbd",
	.groups = rbd_attr_groups,
	.release = rbd_sysfs_dev_release,
};
2026
2027
2028/*
2029 sysfs - snapshots
2030*/
2031
2032static ssize_t rbd_snap_size_show(struct device *dev,
2033 struct device_attribute *attr,
2034 char *buf)
2035{
2036 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2037
Josh Durgin35915382011-12-05 18:25:13 -08002038 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002039}
2040
2041static ssize_t rbd_snap_id_show(struct device *dev,
2042 struct device_attribute *attr,
2043 char *buf)
2044{
2045 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2046
Josh Durgin35915382011-12-05 18:25:13 -08002047 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002048}
2049
/* Per-snapshot sysfs attributes (on the snap_<name> child device). */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/*
 * Device-model release callback for a snapshot device: frees the
 * rbd_snap and its duplicated name once the last reference is gone.
 */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups = rbd_snap_attr_groups,
	.release = rbd_snap_dev_release,
};
2079
/*
 * Unlink @snap from its device's snapshot list and unregister its
 * sysfs device.  The rbd_snap memory itself is freed by
 * rbd_snap_dev_release() via the device release callback.
 */
static void __rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	device_unregister(&snap->dev);
}
2085
Alex Elder14e70852012-07-19 09:09:27 -05002086static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002087 struct device *parent)
2088{
2089 struct device *dev = &snap->dev;
2090 int ret;
2091
2092 dev->type = &rbd_snap_device_type;
2093 dev->parent = parent;
2094 dev->release = rbd_snap_dev_release;
2095 dev_set_name(dev, "snap_%s", snap->name);
2096 ret = device_register(dev);
2097
2098 return ret;
2099}
2100
/*
 * Allocate and initialize an rbd_snap for snapshot @i of the current
 * header (size and id come from the header's parallel arrays), and
 * register its sysfs device if the parent rbd device is already
 * registered.  Returns the new snapshot or an ERR_PTR on failure.
 */
static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
					      int i, const char *name)
{
	struct rbd_snap *snap;
	int ret;

	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
	if (!snap)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	snap->name = kstrdup(name, GFP_KERNEL);
	if (!snap->name)
		goto err;

	snap->size = rbd_dev->header.snap_sizes[i];
	snap->id = rbd_dev->header.snapc->snaps[i];
	if (device_is_registered(&rbd_dev->dev)) {
		ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
		if (ret < 0)
			goto err;
	}

	return snap;

err:
	/*
	 * NOTE(review): if device_register() inside
	 * rbd_register_snap_dev() failed, driver-core convention is to
	 * drop the reference with put_device() (which would invoke
	 * rbd_snap_dev_release) rather than kfree directly -- confirm.
	 */
	kfree(snap->name);
	kfree(snap);

	return ERR_PTR(ret);
}
2132
2133/*
Alex Elder35938152012-08-02 11:29:46 -05002134 * Scan the rbd device's current snapshot list and compare it to the
2135 * newly-received snapshot context. Remove any existing snapshots
2136 * not present in the new snapshot context. Add a new snapshot for
 * any snapshots in the snapshot context not in the current list.
2138 * And verify there are no changes to snapshots we already know
2139 * about.
2140 *
2141 * Assumes the snapshots in the snapshot context are sorted by
2142 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2143 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002144 */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const u32 snap_count = snapc->num_snaps;
	/* Names are packed as consecutive NUL-terminated strings */
	char *snap_name = rbd_dev->header.snap_names;
	struct list_head *head = &rbd_dev->snaps;
	struct list_head *links = head->next;
	u32 index = 0;

	/* Walk the snap context and the existing list in parallel */
	while (index < snap_count || links != head) {
		u64 snap_id;
		struct rbd_snap *snap;

		snap_id = index < snap_count ? snapc->snaps[index]
					     : CEPH_NOSNAP;
		snap = links != head ? list_entry(links, struct rbd_snap, node)
				     : NULL;
		BUG_ON(snap && snap->id == CEPH_NOSNAP);

		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
			struct list_head *next = links->next;

			/* Existing snapshot not in the new snap context */

			/* If it was the mapped snapshot, mark it gone */
			if (rbd_dev->snap_id == snap->id)
				rbd_dev->snap_exists = false;
			__rbd_remove_snap_dev(snap);

			/* Done with this list entry; advance */

			links = next;
			continue;
		}

		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
			struct rbd_snap *new_snap;

			/* We haven't seen this snapshot before */

			new_snap = __rbd_add_snap_dev(rbd_dev, index,
							snap_name);
			if (IS_ERR(new_snap))
				return PTR_ERR(new_snap);

			/* New goes before existing, or at end of list */

			if (snap)
				list_add_tail(&new_snap->node, &snap->node);
			else
				list_add_tail(&new_snap->node, head);
		} else {
			/* Already have this one */

			BUG_ON(snap->size != rbd_dev->header.snap_sizes[index]);
			BUG_ON(strcmp(snap->name, snap_name));

			/* Done with this list entry; advance */

			links = links->next;
		}

		/* Advance to the next entry in the snapshot context */

		index++;
		snap_name += strlen(snap_name) + 1;
	}

	return 0;
}
2214
/*
 * Register the rbd device on the rbd bus (exposing its sysfs
 * attributes) and then register a sysfs device for each of its
 * current snapshots.  ctl_mutex serializes this against other
 * control-path operations.
 *
 * NOTE(review): if a snapshot registration fails mid-loop, earlier
 * registrations are not rolled back here -- verify callers handle a
 * negative return accordingly.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	int ret;
	struct device *dev;
	struct rbd_snap *snap;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	dev = &rbd_dev->dev;

	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);
	if (ret < 0)
		goto out;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
		if (ret < 0)
			break;
	}
out:
	mutex_unlock(&ctl_mutex);
	return ret;
}
2242
/*
 * Remove the rbd device from sysfs; final teardown runs through the
 * release callback assigned in rbd_bus_add_dev().
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2247
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002248static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2249{
2250 int ret, rc;
2251
2252 do {
Alex Elder0e6f3222012-07-25 09:32:40 -05002253 ret = rbd_req_sync_watch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002254 if (ret == -ERANGE) {
Alex Elder1fe5e992012-07-25 09:32:41 -05002255 rc = rbd_refresh_header(rbd_dev, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002256 if (rc < 0)
2257 return rc;
2258 }
2259 } while (ret == -ERANGE);
2260
2261 return ret;
2262}
2263
/* Largest device id handed out so far; see rbd_id_get()/rbd_id_put(). */
static atomic64_t rbd_id_max = ATOMIC64_INIT(0);
2265
2266/*
Alex Elder499afd52012-02-02 08:13:29 -06002267 * Get a unique rbd identifier for the given new rbd_dev, and add
2268 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06002269 */
static void rbd_id_get(struct rbd_device *rbd_dev)
{
	/* Ids start at 1; atomic increment guarantees uniqueness */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002278
Alex Elder1ddbe942012-01-29 13:57:44 -06002279/*
Alex Elder499afd52012-02-02 08:13:29 -06002280 * Remove an rbd_dev from the global list, and record that its
2281 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002282 */
Alex Elder499afd52012-02-02 08:13:29 -06002283static void rbd_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002284{
Alex Elderd184f6b2012-01-29 13:57:44 -06002285 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05002286 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002287 int max_id;
2288
2289 BUG_ON(rbd_id < 1);
Alex Elder499afd52012-02-02 08:13:29 -06002290
2291 spin_lock(&rbd_dev_list_lock);
2292 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002293
2294 /*
2295 * If the id being "put" is not the current maximum, there
2296 * is nothing special we need to do.
2297 */
2298 if (rbd_id != atomic64_read(&rbd_id_max)) {
2299 spin_unlock(&rbd_dev_list_lock);
2300 return;
2301 }
2302
2303 /*
2304 * We need to update the current maximum id. Search the
2305 * list to find out what it is. We're more likely to find
2306 * the maximum at the end, so search the list backward.
2307 */
2308 max_id = 0;
2309 list_for_each_prev(tmp, &rbd_dev_list) {
2310 struct rbd_device *rbd_dev;
2311
2312 rbd_dev = list_entry(tmp, struct rbd_device, node);
2313 if (rbd_id > max_id)
2314 max_id = rbd_id;
2315 }
Alex Elder499afd52012-02-02 08:13:29 -06002316 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002317
Alex Elder1ddbe942012-01-29 13:57:44 -06002318 /*
Alex Elderd184f6b2012-01-29 13:57:44 -06002319 * The max id could have been updated by rbd_id_get(), in
2320 * which case it now accurately reflects the new maximum.
2321 * Be careful not to overwrite the maximum value in that
2322 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002323 */
Alex Elderd184f6b2012-01-29 13:57:44 -06002324 atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
Alex Elderb7f23c32012-01-29 13:57:43 -06002325}
2326
Alex Eldera725f65e2012-02-02 08:13:30 -06002327/*
Alex Eldere28fff262012-02-02 08:13:30 -06002328 * Skips over white space at *buf, and updates *buf to point to the
2329 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06002330 * the token (string of non-white space characters) found. Note
2331 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06002332 */
/*
 * Skip leading white space at *buf (updating the caller's cursor to
 * the first non-space character) and return the length of the token
 * that starts there.  *buf must be NUL-terminated.
 */
static inline size_t next_token(const char **buf)
{
	/* Characters for which isspace() is nonzero in the "C"/"POSIX" locales */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start;

	start = *buf + strspn(*buf, whitespace);	/* find token start */
	*buf = start;

	return strcspn(start, whitespace);		/* token length */
}
2345
2346/*
2347 * Finds the next token in *buf, and if the provided token buffer is
2348 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06002349 * copied, is guaranteed to be terminated with '\0'. Note that *buf
2350 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06002351 *
2352 * Returns the length of the token found (not including the '\0').
2353 * Return value will be 0 if no token is found, and it will be >=
2354 * token_size if the token would not fit.
2355 *
Alex Elder593a9e72012-02-07 12:03:37 -06002356 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06002357 * found token. Note that this occurs even if the token buffer is
2358 * too small to hold it.
2359 */
/*
 * Find the next token in *buf and, if it fits (with its terminating
 * '\0') in the token_size-byte buffer @token, copy it there.  Returns
 * the token length (0 if none found, >= token_size if it didn't fit).
 * *buf is advanced past the token either way.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	/* Copy only when the token plus terminator fits the buffer */
	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2375
2376/*
Alex Elderea3352f2012-07-09 21:04:23 -05002377 * Finds the next token in *buf, dynamically allocates a buffer big
2378 * enough to hold a copy of it, and copies the token into the new
2379 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2380 * that a duplicate buffer is created even for a zero-length token.
2381 *
2382 * Returns a pointer to the newly-allocated duplicate, or a null
2383 * pointer if memory for the duplicate was not available. If
2384 * the lenp argument is a non-null pointer, the length of the token
2385 * (not including the '\0') is returned in *lenp.
2386 *
2387 * If successful, the *buf pointer will be updated to point beyond
2388 * the end of the found token.
2389 *
2390 * Note: uses GFP_KERNEL for allocation.
2391 */
2392static inline char *dup_token(const char **buf, size_t *lenp)
2393{
2394 char *dup;
2395 size_t len;
2396
2397 len = next_token(buf);
2398 dup = kmalloc(len + 1, GFP_KERNEL);
2399 if (!dup)
2400 return NULL;
2401
2402 memcpy(dup, *buf, len);
2403 *(dup + len) = '\0';
2404 *buf += len;
2405
2406 if (lenp)
2407 *lenp = len;
2408
2409 return dup;
2410}
2411
/*
 * This fills in the pool_name, image_name, image_name_len, snap_name,
 * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
 * on the list of monitor addresses and other options provided via
 * /sys/bus/rbd/add.
 *
 * On return *mon_addrs points into buf (no copy is made) and
 * *mon_addrs_size is the token length plus one (room for a '\0').
 * The pool/image/snap names are dynamically allocated and owned by
 * rbd_dev; on failure they are freed and reset to NULL.
 *
 * Note: rbd_dev is assumed to have been initially zero-filled.
 */
static int rbd_add_parse_args(struct rbd_device *rbd_dev,
			      const char *buf,
			      const char **mon_addrs,
			      size_t *mon_addrs_size,
			      char *options,
			      size_t options_size)
{
	size_t len;
	int ret;

	/*
	 * The first four tokens are required: monitor addresses,
	 * options, pool name, and image name.
	 */

	len = next_token(&buf);
	if (!len)
		return -EINVAL;
	*mon_addrs_size = len + 1;
	*mon_addrs = buf;

	buf += len;

	/* The options token must fit the caller-supplied buffer */
	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return -EINVAL;

	ret = -ENOMEM;
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Create the name of the header object */

	rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
						+ sizeof (RBD_SUFFIX),
					GFP_KERNEL);
	if (!rbd_dev->header_name)
		goto out_err;
	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);

	/*
	 * The snapshot name is optional.  If none is supplied,
	 * we use the default value.
	 */
	rbd_dev->snap_name = dup_token(&buf, &len);
	if (!rbd_dev->snap_name)
		goto out_err;
	if (!len) {
		/* Replace the empty name with the default */
		kfree(rbd_dev->snap_name);
		rbd_dev->snap_name
			= kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
		if (!rbd_dev->snap_name)
			goto out_err;

		memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
			sizeof (RBD_SNAP_HEAD_NAME));
	}

	return 0;

out_err:
	/*
	 * Reset each pointer to NULL after freeing: rbd_add()'s error
	 * path keys its own cleanup on pool_name being non-NULL.
	 */
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->image_name);
	rbd_dev->image_name = NULL;
	rbd_dev->image_name_len = 0;
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return ret;
}
2494
/*
 * Handle a write to /sys/bus/rbd/add: parse the add command in buf,
 * set up an rbd_device for it, and bring the mapping online.  Once
 * rbd_bus_add_dev() succeeds, error cleanup becomes the job of the
 * sysfs release path; before that, errors are unwound here.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	char *options;
	struct rbd_device *rbd_dev = NULL;
	const char *mon_addrs = NULL;
	size_t mon_addrs_size = 0;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	/* Hold a module reference for the lifetime of the device */
	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* count bounds the options token copied out of buf */
	options = kmalloc(count, GFP_KERNEL);
	if (!options)
		goto err_nomem;
	/* rbd_add_parse_args() relies on rbd_dev being zero-filled */
	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		goto err_nomem;

	/* static rbd_device initialization */
	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);
	init_rwsem(&rbd_dev->header_rwsem);

	/* generate unique id: find highest unique id, add one */
	rbd_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* parse add command; mon_addrs points into buf, not a copy */
	rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
				options, count);
	if (rc)
		goto err_put_id;

	/* mon_addrs_size includes room for a '\0', so pass one less */
	rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1,
						options);
	if (IS_ERR(rbd_dev->rbd_client)) {
		rc = PTR_ERR(rbd_dev->rbd_client);
		/* don't leave a stale error pointer for cleanup paths */
		rbd_dev->rbd_client = NULL;
		goto err_put_id;
	}

	/* pick the pool */
	osdc = &rbd_dev->rbd_client->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
	if (rc < 0)
		goto err_out_client;
	rbd_dev->pool_id = rc;

	/* register our block device (major 0 = dynamically allocated) */
	rc = register_blkdev(0, rbd_dev->name);
	if (rc < 0)
		goto err_out_client;
	rbd_dev->major = rc;

	rc = rbd_bus_add_dev(rbd_dev);
	if (rc)
		goto err_out_blkdev;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 *
	 * Set up and announce blkdev mapping.
	 */
	rc = rbd_init_disk(rbd_dev);
	if (rc)
		goto err_out_bus;

	rc = rbd_init_watch_dev(rbd_dev);
	if (rc)
		goto err_out_bus;

	return count;

err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);
	kfree(options);
	return rc;

err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_client:
	rbd_put_client(rbd_dev);
err_put_id:
	/* non-NULL pool_name means parsing succeeded and names were dup'd */
	if (rbd_dev->pool_name) {
		kfree(rbd_dev->snap_name);
		kfree(rbd_dev->header_name);
		kfree(rbd_dev->image_name);
		kfree(rbd_dev->pool_name);
	}
	rbd_id_put(rbd_dev);
err_nomem:
	kfree(rbd_dev);
	kfree(options);

	dout("Error adding device %s\n", buf);
	module_put(THIS_MODULE);

	return (ssize_t) rc;
}
2605
Alex Elderde71a292012-07-03 16:01:19 -05002606static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002607{
2608 struct list_head *tmp;
2609 struct rbd_device *rbd_dev;
2610
Alex Eldere124a82f2012-01-29 13:57:44 -06002611 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002612 list_for_each(tmp, &rbd_dev_list) {
2613 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05002614 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a82f2012-01-29 13:57:44 -06002615 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002616 return rbd_dev;
Alex Eldere124a82f2012-01-29 13:57:44 -06002617 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002618 }
Alex Eldere124a82f2012-01-29 13:57:44 -06002619 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002620 return NULL;
2621}
2622
/*
 * Release callback for the rbd device, run when its last reference
 * is dropped (teardown initiated via rbd_bus_del_dev()).  Undoes
 * what rbd_add() set up; the order below matters: stop the watch
 * machinery first, then drop the client, then the block device,
 * and only then free the name strings and the rbd_dev itself.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* cancel the lingering watch request, if one was registered */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->snap_name);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref (taken in rbd_add()) */
	module_put(THIS_MODULE);
}
2653
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002654static ssize_t rbd_remove(struct bus_type *bus,
2655 const char *buf,
2656 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002657{
2658 struct rbd_device *rbd_dev = NULL;
2659 int target_id, rc;
2660 unsigned long ul;
2661 int ret = count;
2662
2663 rc = strict_strtoul(buf, 10, &ul);
2664 if (rc)
2665 return rc;
2666
2667 /* convert to int; abort if we lost anything in the conversion */
2668 target_id = (int) ul;
2669 if (target_id != ul)
2670 return -EINVAL;
2671
2672 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2673
2674 rbd_dev = __rbd_get_dev(target_id);
2675 if (!rbd_dev) {
2676 ret = -ENOENT;
2677 goto done;
2678 }
2679
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002680 __rbd_remove_all_snaps(rbd_dev);
2681 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002682
2683done:
2684 mutex_unlock(&ctl_mutex);
2685 return ret;
2686}
2687
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002688static ssize_t rbd_snap_add(struct device *dev,
2689 struct device_attribute *attr,
2690 const char *buf,
2691 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002692{
Alex Elder593a9e72012-02-07 12:03:37 -06002693 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002694 int ret;
2695 char *name = kmalloc(count + 1, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002696 if (!name)
2697 return -ENOMEM;
2698
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002699 snprintf(name, count, "%s", buf);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002700
2701 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2702
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002703 ret = rbd_header_add_snap(rbd_dev,
2704 name, GFP_KERNEL);
2705 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002706 goto err_unlock;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002707
Alex Elderb8136232012-07-25 09:32:41 -05002708 ret = __rbd_refresh_header(rbd_dev, NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002709 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002710 goto err_unlock;
2711
2712 /* shouldn't hold ctl_mutex when notifying.. notify might
2713 trigger a watch callback that would need to get that mutex */
2714 mutex_unlock(&ctl_mutex);
2715
2716 /* make a best effort, don't error if failed */
Alex Elder4cb16252012-07-25 09:32:40 -05002717 rbd_req_sync_notify(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002718
2719 ret = count;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002720 kfree(name);
2721 return ret;
2722
2723err_unlock:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002724 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002725 kfree(name);
2726 return ret;
2727}
2728
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002729/*
2730 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002731 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002732 */
2733static int rbd_sysfs_init(void)
2734{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002735 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002736
Alex Elderfed4c142012-02-07 12:03:36 -06002737 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06002738 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002739 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002740
Alex Elderfed4c142012-02-07 12:03:36 -06002741 ret = bus_register(&rbd_bus_type);
2742 if (ret < 0)
2743 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002744
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002745 return ret;
2746}
2747
2748static void rbd_sysfs_cleanup(void)
2749{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002750 bus_unregister(&rbd_bus_type);
Alex Elderfed4c142012-02-07 12:03:36 -06002751 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002752}
2753
2754int __init rbd_init(void)
2755{
2756 int rc;
2757
2758 rc = rbd_sysfs_init();
2759 if (rc)
2760 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002761 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002762 return 0;
2763}
2764
2765void __exit rbd_exit(void)
2766{
2767 rbd_sysfs_cleanup();
2768}
2769
/* Module entry/exit points and metadata */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");