blob: ac8a83fc2ad9b77132fd778608b7671d5e455d85 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define SECTOR_SHIFT	9
#define SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* Short and long driver names, used for logging and device naming */
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

#define RBD_MAX_SNAP_NAME_LEN	32
#define RBD_MAX_OPT_LEN		1024

/* Pseudo snapshot name used to map the writable head of an image */
#define RBD_SNAP_HEAD_NAME	"-"

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

/* Default for the notify_timeout mount option (see rbd_opts_tokens) */
#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
73
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	u64 image_size;		/* image size, in bytes */
	char *object_prefix;	/* prefix for data object names (owned) */
	__u8 obj_order;		/* objects are (1 << obj_order) bytes */
	__u8 crypt_type;	/* crypt type from the on-disk header */
	__u8 comp_type;		/* compression type from the on-disk header */
	struct ceph_snap_context *snapc;	/* snapshot ids, refcounted */
	size_t snap_names_len;	/* total bytes of NUL-separated snap names */
	u64 snap_seq;		/* snap seq from the on-disk header */
	u32 total_snaps;	/* number of snapshots */

	char *snap_names;	/* NUL-separated snapshot names (owned) */
	u64 *snap_sizes;	/* per-snapshot image sizes (owned) */

	u64 obj_version;	/* version of the header object */
};

/*
 * rbd-specific (non-ceph) options
 */
struct rbd_options {
	int notify_timeout;	/* "notify_timeout=%d" mount option */
};
97
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client *client;	/* underlying ceph client */
	struct rbd_options *rbd_opts;	/* rbd-specific options (owned) */
	struct kref kref;		/* shared by rbd devices */
	struct list_head node;		/* entry in rbd_client_list */
};

/*
 * a request completion status
 */
struct rbd_req_status {
	int done;	/* nonzero once this sub-request has completed */
	int rc;		/* completion result */
	u64 bytes;	/* bytes transferred */
};

/*
 * a collection of requests
 */
struct rbd_req_coll {
	int total;		/* number of sub-requests in the collection */
	int num_done;		/* sub-requests completed so far */
	struct kref kref;	/* one ref held per outstanding sub-request */
	struct rbd_req_status status[0];	/* one slot per sub-request */
};
126
/*
 * a single io request
 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;			/* length of this request, in bytes */
	int coll_index;			/* slot in coll->status[], if any */
	struct rbd_req_coll *coll;	/* owning collection, or NULL */
};

/* One snapshot of an image, exposed as a sysfs device */
struct rbd_snap {
	struct device dev;		/* sysfs representation */
	const char *name;		/* snapshot name */
	u64 size;			/* image size for this snapshot */
	struct list_head node;		/* entry in rbd_dev->snaps */
	u64 id;				/* snapshot id */
};
146
/*
 * a single device
 */
struct rbd_device {
	int id;			/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;	/* blkdev's gendisk and rq */
	struct request_queue *q;

	struct rbd_client *rbd_client;	/* possibly shared ceph client */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;	/* queue lock */

	struct rbd_image_header header;	/* in-memory image metadata */
	char *image_name;	/* name of the rbd image */
	size_t image_name_len;
	char *header_name;	/* name of the image's header object */
	char *pool_name;	/* pool the image lives in */
	int pool_id;

	struct ceph_osd_event *watch_event;	/* watch on the header object */
	struct ceph_osd_request *watch_request;	/* lingering watch request */

	/* protects updating the header */
	struct rw_semaphore header_rwsem;
	/* name of the snapshot this device reads from */
	char *snap_name;
	/* id of the snapshot this device reads from */
	u64 snap_id;		/* current snapshot id */
	/* whether the snap_id this device reads from still exists */
	bool snap_exists;
	int read_only;		/* nonzero when mapping a snapshot */

	struct list_head node;	/* entry in rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
};
191
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Forward declarations for routines defined later in this file */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
static void rbd_dev_release(struct device *dev);
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count);
static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
				  struct rbd_snap *snap);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);

/* Bus attributes: devices are mapped/unmapped by writing to these */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/* rbd_root_dev is static, so there is nothing to release */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Parent device of all rbd devices in sysfs */
static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};
233
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800234
/* Take a reference on the rbd device via its embedded struct device. */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}

/* Drop a reference taken with rbd_get_dev(). */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}

static int __rbd_refresh_header(struct rbd_device *rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700246
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700247static int rbd_open(struct block_device *bdev, fmode_t mode)
248{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600249 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700250
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800251 rbd_get_dev(rbd_dev);
252
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700253 set_device_ro(bdev, rbd_dev->read_only);
254
255 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
256 return -EROFS;
257
258 return 0;
259}
260
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800261static int rbd_release(struct gendisk *disk, fmode_t mode)
262{
263 struct rbd_device *rbd_dev = disk->private_data;
264
265 rbd_put_dev(rbd_dev);
266
267 return 0;
268}
269
/* Block device operations for mapped rbd devices */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
275
276/*
277 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500278 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700279 */
Alex Elder43ae4702012-07-03 16:01:18 -0500280static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700281 struct rbd_options *rbd_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700282{
283 struct rbd_client *rbdc;
284 int ret = -ENOMEM;
285
286 dout("rbd_client_create\n");
287 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
288 if (!rbdc)
289 goto out_opt;
290
291 kref_init(&rbdc->kref);
292 INIT_LIST_HEAD(&rbdc->node);
293
Alex Elderbc534d82012-01-29 13:57:44 -0600294 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
295
Alex Elder43ae4702012-07-03 16:01:18 -0500296 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700297 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600298 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500299 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700300
301 ret = ceph_open_session(rbdc->client);
302 if (ret < 0)
303 goto out_err;
304
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700305 rbdc->rbd_opts = rbd_opts;
306
Alex Elder432b8582012-01-29 13:57:44 -0600307 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700308 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600309 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700310
Alex Elderbc534d82012-01-29 13:57:44 -0600311 mutex_unlock(&ctl_mutex);
312
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700313 dout("rbd_client_create created %p\n", rbdc);
314 return rbdc;
315
316out_err:
317 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600318out_mutex:
319 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700320 kfree(rbdc);
321out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500322 if (ceph_opts)
323 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400324 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700325}
326
327/*
328 * Find a ceph client with specific addr and configuration.
329 */
Alex Elder43ae4702012-07-03 16:01:18 -0500330static struct rbd_client *__rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700331{
332 struct rbd_client *client_node;
333
Alex Elder43ae4702012-07-03 16:01:18 -0500334 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700335 return NULL;
336
337 list_for_each_entry(client_node, &rbd_client_list, node)
Alex Elder43ae4702012-07-03 16:01:18 -0500338 if (!ceph_compare_options(ceph_opts, client_node->client))
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700339 return client_node;
340 return NULL;
341}
342
/*
 * mount options
 *
 * Opt_last_int and Opt_last_string are sentinels separating the
 * int-valued tokens from the string-valued ones; parse_rbd_opts_token()
 * uses them to decide how to parse each argument.
 */
enum {
	Opt_notify_timeout,
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
};

static match_table_t rbd_opts_tokens = {
	{Opt_notify_timeout, "notify_timeout=%d"},
	/* int args above */
	/* string args above */
	{-1, NULL}	/* terminator */
};
360
361static int parse_rbd_opts_token(char *c, void *private)
362{
Alex Elder43ae4702012-07-03 16:01:18 -0500363 struct rbd_options *rbd_opts = private;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700364 substring_t argstr[MAX_OPT_ARGS];
365 int token, intval, ret;
366
Alex Elder43ae4702012-07-03 16:01:18 -0500367 token = match_token(c, rbd_opts_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700368 if (token < 0)
369 return -EINVAL;
370
371 if (token < Opt_last_int) {
372 ret = match_int(&argstr[0], &intval);
373 if (ret < 0) {
374 pr_err("bad mount option arg (not int) "
375 "at '%s'\n", c);
376 return ret;
377 }
378 dout("got int token %d val %d\n", token, intval);
379 } else if (token > Opt_last_int && token < Opt_last_string) {
380 dout("got string token %d val %s\n", token,
381 argstr[0].from);
382 } else {
383 dout("got token %d\n", token);
384 }
385
386 switch (token) {
387 case Opt_notify_timeout:
Alex Elder43ae4702012-07-03 16:01:18 -0500388 rbd_opts->notify_timeout = intval;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700389 break;
390 default:
391 BUG_ON(token);
392 }
393 return 0;
394}
395
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.
 *
 * On success a referenced rbd_client is returned; on failure an
 * ERR_PTR.  Ownership notes: when an existing client is reused, the
 * freshly parsed option structures are discarded; when a new client is
 * created, rbd_client_create() takes them over.
 */
static struct rbd_client *rbd_get_client(const char *mon_addr,
					 size_t mon_addr_len,
					 char *options)
{
	struct rbd_client *rbdc;
	struct ceph_options *ceph_opts;
	struct rbd_options *rbd_opts;

	rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		return ERR_PTR(-ENOMEM);

	rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;

	/* rbd-specific tokens in "options" are routed to
	 * parse_rbd_opts_token(), which fills in rbd_opts */
	ceph_opts = ceph_parse_options(options, mon_addr,
					mon_addr + mon_addr_len,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(ceph_opts)) {
		kfree(rbd_opts);
		return ERR_CAST(ceph_opts);
	}

	spin_lock(&rbd_client_list_lock);
	rbdc = __rbd_client_find(ceph_opts);
	if (rbdc) {
		/* using an existing client */
		kref_get(&rbdc->kref);
		spin_unlock(&rbd_client_list_lock);

		/* the shared client already carries equivalent options */
		ceph_destroy_options(ceph_opts);
		kfree(rbd_opts);

		return rbdc;
	}
	spin_unlock(&rbd_client_list_lock);

	/* rbd_client_create() owns ceph_opts and, on success, rbd_opts */
	rbdc = rbd_client_create(ceph_opts, rbd_opts);

	if (IS_ERR(rbdc))
		kfree(rbd_opts);

	return rbdc;
}
443
/*
 * Destroy ceph client
 *
 * kref release callback; takes rbd_client_list_lock itself to unlink
 * the client from the global list before tearing it down.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc->rbd_opts);
	kfree(rbdc);
}
462
/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_device *rbd_dev)
{
	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
	rbd_dev->rbd_client = NULL;	/* the device no longer owns a ref */
}
472
/*
 * Destroy requests collection
 *
 * kref release callback, invoked when the last sub-request drops its
 * reference (see rbd_coll_end_req_index()).
 */
static void rbd_coll_release(struct kref *kref)
{
	struct rbd_req_coll *coll =
		container_of(kref, struct rbd_req_coll, kref);

	dout("rbd_coll_release %p\n", coll);
	kfree(coll);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700484
/*
 * Create a new header structure, translate header format from the on-disk
 * header.
 *
 * "allocated_snaps" is the snapshot count the caller sized "ondisk"
 * for; snapshot ids/sizes/names are only copied when it matches the
 * count recorded in the on-disk header.  Returns 0 on success, -ENXIO
 * for a bad header signature, -EINVAL for an implausible snapshot
 * count, or -ENOMEM.
 */
static int rbd_header_from_disk(struct rbd_image_header *header,
				 struct rbd_image_header_ondisk *ondisk,
				 u32 allocated_snaps,
				 gfp_t gfp_flags)
{
	u32 i, snap_count;

	/* Reject anything that does not carry the rbd header signature */
	if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT)))
		return -ENXIO;

	/* Guard the snap context allocation size against overflow */
	snap_count = le32_to_cpu(ondisk->snap_count);
	if (snap_count > (UINT_MAX - sizeof(struct ceph_snap_context))
			 / sizeof (*ondisk))
		return -EINVAL;
	header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
				snap_count * sizeof(u64),
				gfp_flags);
	if (!header->snapc)
		return -ENOMEM;

	/* NOTE(review): snap_names_len comes from the (untrusted) on-disk
	 * header and is used as an allocation size unchecked — verify the
	 * caller bounds it. */
	header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
	if (snap_count) {
		header->snap_names = kmalloc(header->snap_names_len,
					     gfp_flags);
		if (!header->snap_names)
			goto err_snapc;
		header->snap_sizes = kmalloc(snap_count * sizeof(u64),
					     gfp_flags);
		if (!header->snap_sizes)
			goto err_names;
	} else {
		header->snap_names = NULL;
		header->snap_sizes = NULL;
	}

	/* Copy the object name prefix, adding a terminating NUL */
	header->object_prefix = kmalloc(sizeof (ondisk->block_name) + 1,
					gfp_flags);
	if (!header->object_prefix)
		goto err_sizes;

	memcpy(header->object_prefix, ondisk->block_name,
	       sizeof(ondisk->block_name));
	header->object_prefix[sizeof (ondisk->block_name)] = '\0';

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->obj_order = ondisk->options.order;
	header->crypt_type = ondisk->options.crypt_type;
	header->comp_type = ondisk->options.comp_type;

	atomic_set(&header->snapc->nref, 1);
	header->snap_seq = le64_to_cpu(ondisk->snap_seq);
	header->snapc->num_snaps = snap_count;
	header->total_snaps = snap_count;

	/* Only copy snapshot data when the caller allocated room for
	 * exactly the number of snapshots the header reports */
	if (snap_count && allocated_snaps == snap_count) {
		for (i = 0; i < snap_count; i++) {
			header->snapc->snaps[i] =
				le64_to_cpu(ondisk->snaps[i].id);
			header->snap_sizes[i] =
				le64_to_cpu(ondisk->snaps[i].image_size);
		}

		/* copy snapshot names */
		memcpy(header->snap_names, &ondisk->snaps[i],
			header->snap_names_len);
	}

	return 0;

err_sizes:
	kfree(header->snap_sizes);
err_names:
	kfree(header->snap_names);
err_snapc:
	kfree(header->snapc);
	return -ENOMEM;
}
566
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700567static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
568 u64 *seq, u64 *size)
569{
570 int i;
571 char *p = header->snap_names;
572
Alex Elder00f1f362012-02-07 12:03:36 -0600573 for (i = 0; i < header->total_snaps; i++) {
574 if (!strcmp(snap_name, p)) {
575
576 /* Found it. Pass back its id and/or size */
577
578 if (seq)
579 *seq = header->snapc->snaps[i];
580 if (size)
581 *size = header->snap_sizes[i];
582 return i;
583 }
584 p += strlen(p) + 1; /* Skip ahead to the next name */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700585 }
Alex Elder00f1f362012-02-07 12:03:36 -0600586 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700587}
588
/*
 * Resolve rbd_dev->snap_name and set up the device's snapshot state:
 * either the writable head image (RBD_SNAP_HEAD_NAME) or a read-only
 * view of a named snapshot.  Optionally passes back the image size.
 * Returns 0 on success or a negative errno if the name is unknown.
 */
static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
{
	int ret;

	/* We update the device's snapshot fields below */
	down_write(&rbd_dev->header_rwsem);

	if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		/* Mapping the head: writable, no snapshot id */
		rbd_dev->snap_id = CEPH_NOSNAP;
		rbd_dev->snap_exists = false;
		rbd_dev->read_only = 0;
		if (size)
			*size = rbd_dev->header.image_size;
	} else {
		u64 snap_id = 0;

		ret = snap_by_name(&rbd_dev->header, rbd_dev->snap_name,
					&snap_id, size);
		if (ret < 0)
			goto done;
		rbd_dev->snap_id = snap_id;
		rbd_dev->snap_exists = true;
		rbd_dev->read_only = 1;	/* snapshots are read-only */
	}

	ret = 0;
done:
	up_write(&rbd_dev->header_rwsem);
	return ret;
}
619
/* Free everything an in-memory image header owns. */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	kfree(header->snap_sizes);
	kfree(header->snap_names);
	ceph_put_snap_context(header->snapc);	/* refcounted, not kfree'd */
}
627
628/*
629 * get the actual striped segment name, offset and length
630 */
631static u64 rbd_get_segment(struct rbd_image_header *header,
Alex Elderca1e49a2012-07-10 20:30:09 -0500632 const char *object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700633 u64 ofs, u64 len,
634 char *seg_name, u64 *segofs)
635{
636 u64 seg = ofs >> header->obj_order;
637
638 if (seg_name)
639 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
Alex Elderca1e49a2012-07-10 20:30:09 -0500640 "%s.%012llx", object_prefix, seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700641
642 ofs = ofs & ((1 << header->obj_order) - 1);
643 len = min_t(u64, len, (1 << header->obj_order) - ofs);
644
645 if (segofs)
646 *segofs = ofs;
647
648 return len;
649}
650
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700651static int rbd_get_num_segments(struct rbd_image_header *header,
652 u64 ofs, u64 len)
653{
654 u64 start_seg = ofs >> header->obj_order;
655 u64 end_seg = (ofs + len - 1) >> header->obj_order;
656 return end_seg - start_seg + 1;
657}
658
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700659/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700660 * returns the size of an object in the image
661 */
662static u64 rbd_obj_bytes(struct rbd_image_header *header)
663{
664 return 1 << header->obj_order;
665}
666
667/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700668 * bio helpers
669 */
670
671static void bio_chain_put(struct bio *chain)
672{
673 struct bio *tmp;
674
675 while (chain) {
676 tmp = chain;
677 chain = chain->bi_next;
678 bio_put(tmp);
679 }
680}
681
682/*
683 * zeros a bio chain, starting at specific offset
684 */
/* Zero the data of a bio chain from byte offset start_ofs (relative to
 * the start of the chain) through its end. */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* byte position within the whole chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* Zero from max(start_ofs, pos) to the
				 * end of this segment */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
708
709/*
710 * bio_chain_clone - clone a chain of bios up to a certain length.
711 * might return a bio_pair that will need to be released.
712 */
static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
				   struct bio_pair **bp,
				   int len, gfp_t gfpmask)
{
	struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
	int total = 0;

	/* Drop any split left over from a previous call */
	if (*bp) {
		bio_pair_release(*bp);
		*bp = NULL;
	}

	/* Clone bios until the new chain covers "len" bytes */
	while (old_chain && (total < len)) {
		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
		if (!tmp)
			goto err_out;

		if (total + old_chain->bi_size > len) {
			/* The last bio straddles the boundary: split it,
			 * clone the first half, and hand the second half
			 * back via *next */
			struct bio_pair *bp;

			/*
			 * this split can only happen with a single paged bio,
			 * split_bio will BUG_ON if this is not the case
			 */
			dout("bio_chain_clone split! total=%d remaining=%d"
			     "bi_size=%d\n",
			     (int)total, (int)len-total,
			     (int)old_chain->bi_size);

			/* split the bio. We'll release it either in the next
			   call, or it will have to be released outside */
			bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
			if (!bp)
				goto err_out;

			__bio_clone(tmp, &bp->bio1);

			*next = &bp->bio2;
		} else {
			__bio_clone(tmp, old_chain);
			*next = old_chain->bi_next;
		}

		tmp->bi_bdev = NULL;
		/* after the first clone, don't allow blocking allocations */
		gfpmask &= ~__GFP_WAIT;
		tmp->bi_next = NULL;

		/* Append the clone to the new chain */
		if (!new_chain) {
			new_chain = tail = tmp;
		} else {
			tail->bi_next = tmp;
			tail = tmp;
		}
		old_chain = old_chain->bi_next;

		total += tmp->bi_size;
	}

	BUG_ON(total < len);	/* caller asked for more than the chain holds */

	if (tail)
		tail->bi_next = NULL;

	/* Pass back the unconsumed remainder of the old chain */
	*old = old_chain;

	return new_chain;

err_out:
	dout("bio_chain_clone with err\n");
	bio_chain_put(new_chain);
	return NULL;
}
785
786/*
787 * helpers for osd request op vectors.
788 */
789static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
790 int num_ops,
791 int opcode,
792 u32 payload_len)
793{
794 *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
795 GFP_NOIO);
796 if (!*ops)
797 return -ENOMEM;
798 (*ops)[0].op = opcode;
799 /*
800 * op extent offset and length will be set later on
801 * in calc_raw_layout()
802 */
803 (*ops)[0].payload_len = payload_len;
804 return 0;
805}
806
/* Free an op vector allocated by rbd_create_rw_ops(). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
811
/*
 * Record completion of the sub-request in slot "index" of a collection
 * and, under the queue lock, complete as many consecutive finished
 * portions of the block request as possible (completions must be
 * delivered to the block layer in order).
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n",
	     coll, index, ret, len);

	if (!rq)
		return;

	if (!coll) {
		/* Single-part request: complete it directly */
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	/* [min, max) is the run of contiguous completed slots starting
	 * at the first not-yet-reported one */
	max = min = coll->num_done;
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i<max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		/* each reported slot drops its reference on the collection */
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
849
/* Complete the collection slot belonging to a single rbd_request. */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
855
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700856/*
857 * Send ceph osd request
858 */
859static int rbd_do_request(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -0500860 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700861 struct ceph_snap_context *snapc,
862 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -0500863 const char *object_name, u64 ofs, u64 len,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700864 struct bio *bio,
865 struct page **pages,
866 int num_pages,
867 int flags,
868 struct ceph_osd_req_op *ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700869 struct rbd_req_coll *coll,
870 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700871 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700872 struct ceph_msg *msg),
873 struct ceph_osd_request **linger_req,
874 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700875{
876 struct ceph_osd_request *req;
877 struct ceph_file_layout *layout;
878 int ret;
879 u64 bno;
880 struct timespec mtime = CURRENT_TIME;
881 struct rbd_request *req_data;
882 struct ceph_osd_request_head *reqhead;
Alex Elder1dbb4392012-01-24 10:08:37 -0600883 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700884
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700885 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700886 if (!req_data) {
887 if (coll)
888 rbd_coll_end_req_index(rq, coll, coll_index,
889 -ENOMEM, len);
890 return -ENOMEM;
891 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700892
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700893 if (coll) {
894 req_data->coll = coll;
895 req_data->coll_index = coll_index;
896 }
897
Alex Elderaded07e2012-07-03 16:01:18 -0500898 dout("rbd_do_request object_name=%s ofs=%lld len=%lld\n",
899 object_name, len, ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700900
Alex Elder0ce1a792012-07-03 16:01:18 -0500901 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder1dbb4392012-01-24 10:08:37 -0600902 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
903 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -0700904 if (!req) {
Sage Weil4ad12622011-05-03 09:23:36 -0700905 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700906 goto done_pages;
907 }
908
909 req->r_callback = rbd_cb;
910
911 req_data->rq = rq;
912 req_data->bio = bio;
913 req_data->pages = pages;
914 req_data->len = len;
915
916 req->r_priv = req_data;
917
918 reqhead = req->r_request->front.iov_base;
919 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
920
Alex Elderaded07e2012-07-03 16:01:18 -0500921 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700922 req->r_oid_len = strlen(req->r_oid);
923
924 layout = &req->r_file_layout;
925 memset(layout, 0, sizeof(*layout));
926 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
927 layout->fl_stripe_count = cpu_to_le32(1);
928 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
Alex Elder0ce1a792012-07-03 16:01:18 -0500929 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
Alex Elder1dbb4392012-01-24 10:08:37 -0600930 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
931 req, ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700932
933 ceph_osdc_build_request(req, ofs, &len,
934 ops,
935 snapc,
936 &mtime,
937 req->r_oid, req->r_oid_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700938
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700939 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600940 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700941 *linger_req = req;
942 }
943
Alex Elder1dbb4392012-01-24 10:08:37 -0600944 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700945 if (ret < 0)
946 goto done_err;
947
948 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600949 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700950 if (ver)
951 *ver = le64_to_cpu(req->r_reassert_version.version);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700952 dout("reassert_ver=%lld\n",
953 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700954 ceph_osdc_put_request(req);
955 }
956 return ret;
957
958done_err:
959 bio_chain_put(req_data->bio);
960 ceph_osdc_put_request(req);
961done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700962 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700963 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700964 return ret;
965}
966
/*
 * Ceph osd op callback
 *
 * Completion handler for asynchronous requests issued via
 * rbd_do_request().  Parses the reply out of the message front,
 * fixes up short/missing reads by zero-filling the bio chain,
 * completes the collection slot, and releases the references taken
 * when the request was built.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);	/* first op record follows the header */
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);

	if (rc == -ENOENT && read_op) {
		/* reading a nonexistent object yields all zeroes */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero the remainder of the buffer */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1005
/*
 * Minimal completion callback: just drop the request reference.
 * Used for fire-and-forget requests (e.g. notify acks) whose result
 * nobody waits on.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1010
/*
 * Do a synchronous ceph osd operation
 *
 * Data is exchanged through a temporary page vector: for writes, buf
 * (if any) is copied into the pages before the request; for reads the
 * pages are copied back into buf afterwards.  If the caller supplies
 * orig_ops they are used as-is and remain owned by the caller;
 * otherwise a single op of 'opcode' is built here and destroyed
 * before returning.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int opcode,
			   int flags,
			   struct ceph_osd_req_op *orig_ops,
			   const char *object_name,
			   u64 ofs, u64 len,
			   char *buf,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;
	struct ceph_osd_req_op *ops = orig_ops;
	u32 payload_len;

	num_pages = calc_pages_for(ofs , len);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	if (!orig_ops) {
		/* build a single op; only writes carry a data payload */
		payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
		ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
		if (ret < 0)
			goto done;

		if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
			ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
			if (ret < 0)
				goto done_ops;
		}
	}

	/* rq == NULL and no callback: rbd_do_request() waits for us */
	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			  object_name, ofs, len, NULL,
			  pages, num_pages,
			  flags,
			  ops,
			  NULL, 0,
			  NULL,
			  linger_req, ver);
	if (ret < 0)
		goto done_ops;

	/* for reads, ret is the number of bytes actually read */
	if ((flags & CEPH_OSD_FLAG_READ) && buf)
		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);

done_ops:
	if (!orig_ops)
		rbd_destroy_ops(ops);	/* only free ops we built ourselves */
done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1071
/*
 * Do an asynchronous ceph osd operation
 *
 * Maps the image byte range [ofs, ofs+len) onto its rados object
 * segment, builds a single read/write op for it, and submits via
 * rbd_do_request().  Completion is reported asynchronously through
 * rbd_req_cb() into coll[coll_index].
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 snapid,
		     int opcode, int flags,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;

	seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
	if (!seg_name)
		return -ENOMEM;

	/* translate image offset/length to object name/offset/length */
	seg_len = rbd_get_segment(&rbd_dev->header,
				  rbd_dev->header.object_prefix,
				  ofs, len,
				  seg_name, &seg_ofs);

	/* only writes carry a data payload */
	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);

	ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
	if (ret < 0)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	BUG_ON(seg_len < len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1126
1127/*
1128 * Request async osd write
1129 */
1130static int rbd_req_write(struct request *rq,
1131 struct rbd_device *rbd_dev,
1132 struct ceph_snap_context *snapc,
1133 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001134 struct bio *bio,
1135 struct rbd_req_coll *coll,
1136 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001137{
1138 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1139 CEPH_OSD_OP_WRITE,
1140 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001141 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001142}
1143
1144/*
1145 * Request async osd read
1146 */
1147static int rbd_req_read(struct request *rq,
1148 struct rbd_device *rbd_dev,
1149 u64 snapid,
1150 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001151 struct bio *bio,
1152 struct rbd_req_coll *coll,
1153 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001154{
1155 return rbd_do_op(rq, rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001156 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001157 CEPH_OSD_OP_READ,
1158 CEPH_OSD_FLAG_READ,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001159 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001160}
1161
/*
 * Request sync osd read
 *
 * NOTE(review): the snapc argument is accepted but never used -- NULL
 * is passed to rbd_req_sync_op() unconditionally.  Callers currently
 * pass NULL anyway; consider dropping the parameter.
 */
static int rbd_req_sync_read(struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name,
			  u64 ofs, u64 len,
			  char *buf,
			  u64 *ver)
{
	return rbd_req_sync_op(rbd_dev, NULL,
			       snapid,
			       CEPH_OSD_OP_READ,
			       CEPH_OSD_FLAG_READ,
			       NULL,
			       object_name, ofs, len, buf, NULL, ver);
}
1180
/*
 * Acknowledge a watch notification from the osd, so it stops
 * re-sending it.  (The previous comment said "Request sync osd
 * watch", which described the wrong function.)
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id,
				   const char *object_name)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (ret < 0)
		return ret;

	ops[0].watch.ver = cpu_to_le64(ver);
	/* NOTE(review): ver is byte-swapped above but cookie is not;
	 * confirm against the osd_client encoding which byte order
	 * this field is expected in. */
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	/* fire and forget: rbd_simple_req_cb just drops the request */
	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  object_name, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  ops,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1211
1212static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1213{
Alex Elder0ce1a792012-07-03 16:01:18 -05001214 struct rbd_device *rbd_dev = (struct rbd_device *)data;
Josh Durgina71b8912011-12-05 18:10:44 -08001215 u64 hver;
Sage Weil13143d22011-05-12 16:08:30 -07001216 int rc;
1217
Alex Elder0ce1a792012-07-03 16:01:18 -05001218 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001219 return;
1220
Alex Elder0bed54d2012-07-03 16:01:18 -05001221 dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n",
1222 rbd_dev->header_name, notify_id, (int) opcode);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001223 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder0ce1a792012-07-03 16:01:18 -05001224 rc = __rbd_refresh_header(rbd_dev);
Josh Durgina71b8912011-12-05 18:10:44 -08001225 hver = rbd_dev->header.obj_version;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001226 mutex_unlock(&ctl_mutex);
Sage Weil13143d22011-05-12 16:08:30 -07001227 if (rc)
Alex Elderf0f8cef2012-01-29 13:57:44 -06001228 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
Alex Elder0ce1a792012-07-03 16:01:18 -05001229 " update snaps: %d\n", rbd_dev->major, rc);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001230
Josh Durgina71b8912011-12-05 18:10:44 -08001231 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id, rbd_dev->header_name);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001232}
1233
/*
 * Request sync osd watch
 *
 * Register a watch on the header object so this client is notified
 * of changes made by others.  On success the lingering request is
 * stored in rbd_dev->watch_request and the event in
 * rbd_dev->watch_event; notifications arrive via rbd_watch_cb().
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev,
			      const char *object_name,
			      u64 ver)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;

	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
	if (ret < 0)
		return ret;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(ver);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 = register the watch */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      object_name, 0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1278
/*
 * Request sync osd unwatch
 *
 * Tear down the watch registered by rbd_req_sync_watch(): flag = 0
 * unregisters it on the osd, then the local event is cancelled and
 * cleared unconditionally.
 */
static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev,
				const char *object_name)
{
	struct ceph_osd_req_op *ops;

	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
	if (ret < 0)
		return ret;

	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 0;	/* 0 = unregister */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      object_name, 0, 0, NULL, NULL, NULL);

	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	return ret;
}
1307
/* Context passed to rbd_notify_cb() through ceph_osdc_create_event()
 * by rbd_req_sync_notify(). */
struct rbd_notify_info {
	struct rbd_device *rbd_dev;	/* device the notify was sent for */
};
1311
1312static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1313{
Alex Elder0ce1a792012-07-03 16:01:18 -05001314 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1315 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001316 return;
1317
Alex Elder0ce1a792012-07-03 16:01:18 -05001318 dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n",
Alex Elder0bed54d2012-07-03 16:01:18 -05001319 rbd_dev->header_name,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001320 notify_id, (int)opcode);
1321}
1322
/*
 * Request sync osd notify
 *
 * Sends a NOTIFY op on the object and blocks (up to the default osd
 * timeout) until it has been acknowledged.  A temporary event routes
 * the acknowledgement to rbd_notify_cb() with &info as its data.
 */
static int rbd_req_sync_notify(struct rbd_device *rbd_dev,
			       const char *object_name)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_event *event;
	struct rbd_notify_info info;
	int payload_len = sizeof(u32) + sizeof(u32);	/* two 32-bit fields */
	int ret;

	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
	if (ret < 0)
		return ret;

	info.rbd_dev = rbd_dev;

	/* on-stack info is safe: we wait for the event before returning */
	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
				     (void *)&info, &event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = 1;
	ops[0].watch.flag = 1;
	ops[0].watch.cookie = event->cookie;
	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
	ops[0].watch.timeout = 12;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			       CEPH_NOSNAP,
			       0,
			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			       ops,
			       object_name, 0, 0, NULL, NULL, NULL);
	if (ret < 0)
		goto fail_event;

	ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
	dout("ceph_osdc_wait_event returned %d\n", ret);
	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(event);
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1373
/*
 * Synchronously invoke a method of a rados object class on an object
 * (e.g. class "rbd", method "snap_add").  (The previous comment said
 * "Request sync osd read", which was wrong.)
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *data,
			     int len,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int class_name_len = strlen(class_name);
	int method_name_len = strlen(method_name);
	/* payload carries class name, method name and input data */
	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
				    class_name_len + method_name_len + len);
	if (ret < 0)
		return ret;

	ops[0].cls.class_name = class_name;
	ops[0].cls.class_len = (__u8) class_name_len;
	ops[0].cls.method_name = method_name;
	ops[0].cls.method_len = (__u8) method_name_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = data;
	ops[0].cls.indata_len = len;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			       CEPH_NOSNAP,
			       0,
			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			       ops,
			       object_name, 0, 0, NULL, NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1413
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001414static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1415{
1416 struct rbd_req_coll *coll =
1417 kzalloc(sizeof(struct rbd_req_coll) +
1418 sizeof(struct rbd_req_status) * num_reqs,
1419 GFP_ATOMIC);
1420
1421 if (!coll)
1422 return NULL;
1423 coll->total = num_reqs;
1424 kref_init(&coll->kref);
1425 return coll;
1426}
1427
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001428/*
1429 * block device queue callback
1430 */
1431static void rbd_rq_fn(struct request_queue *q)
1432{
1433 struct rbd_device *rbd_dev = q->queuedata;
1434 struct request *rq;
1435 struct bio_pair *bp = NULL;
1436
Alex Elder00f1f362012-02-07 12:03:36 -06001437 while ((rq = blk_fetch_request(q))) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001438 struct bio *bio;
1439 struct bio *rq_bio, *next_bio = NULL;
1440 bool do_write;
1441 int size, op_size = 0;
1442 u64 ofs;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001443 int num_segs, cur_seg = 0;
1444 struct rbd_req_coll *coll;
Josh Durgind1d25642011-12-05 14:03:05 -08001445 struct ceph_snap_context *snapc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001446
1447 /* peek at request from block layer */
1448 if (!rq)
1449 break;
1450
1451 dout("fetched request\n");
1452
1453 /* filter out block requests we don't understand */
1454 if ((rq->cmd_type != REQ_TYPE_FS)) {
1455 __blk_end_request_all(rq, 0);
Alex Elder00f1f362012-02-07 12:03:36 -06001456 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001457 }
1458
1459 /* deduce our operation (read, write) */
1460 do_write = (rq_data_dir(rq) == WRITE);
1461
1462 size = blk_rq_bytes(rq);
Alex Elder593a9e72012-02-07 12:03:37 -06001463 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001464 rq_bio = rq->bio;
1465 if (do_write && rbd_dev->read_only) {
1466 __blk_end_request_all(rq, -EROFS);
Alex Elder00f1f362012-02-07 12:03:36 -06001467 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001468 }
1469
1470 spin_unlock_irq(q->queue_lock);
1471
Josh Durgind1d25642011-12-05 14:03:05 -08001472 down_read(&rbd_dev->header_rwsem);
Josh Durgine88a36e2011-11-21 18:14:25 -08001473
Josh Durgind1d25642011-12-05 14:03:05 -08001474 if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) {
Josh Durgine88a36e2011-11-21 18:14:25 -08001475 up_read(&rbd_dev->header_rwsem);
Josh Durgind1d25642011-12-05 14:03:05 -08001476 dout("request for non-existent snapshot");
1477 spin_lock_irq(q->queue_lock);
1478 __blk_end_request_all(rq, -ENXIO);
1479 continue;
Josh Durgine88a36e2011-11-21 18:14:25 -08001480 }
1481
Josh Durgind1d25642011-12-05 14:03:05 -08001482 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1483
1484 up_read(&rbd_dev->header_rwsem);
1485
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001486 dout("%s 0x%x bytes at 0x%llx\n",
1487 do_write ? "write" : "read",
Alex Elder593a9e72012-02-07 12:03:37 -06001488 size, blk_rq_pos(rq) * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001489
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001490 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
1491 coll = rbd_alloc_coll(num_segs);
1492 if (!coll) {
1493 spin_lock_irq(q->queue_lock);
1494 __blk_end_request_all(rq, -ENOMEM);
Josh Durgind1d25642011-12-05 14:03:05 -08001495 ceph_put_snap_context(snapc);
Alex Elder00f1f362012-02-07 12:03:36 -06001496 continue;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001497 }
1498
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001499 do {
1500 /* a bio clone to be passed down to OSD req */
1501 dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
1502 op_size = rbd_get_segment(&rbd_dev->header,
Alex Elderca1e49a2012-07-10 20:30:09 -05001503 rbd_dev->header.object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001504 ofs, size,
1505 NULL, NULL);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001506 kref_get(&coll->kref);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001507 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1508 op_size, GFP_ATOMIC);
1509 if (!bio) {
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001510 rbd_coll_end_req_index(rq, coll, cur_seg,
1511 -ENOMEM, op_size);
1512 goto next_seg;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001513 }
1514
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001515
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001516 /* init OSD command: write or read */
1517 if (do_write)
1518 rbd_req_write(rq, rbd_dev,
Josh Durgind1d25642011-12-05 14:03:05 -08001519 snapc,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001520 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001521 op_size, bio,
1522 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001523 else
1524 rbd_req_read(rq, rbd_dev,
Josh Durgin77dfe992011-11-21 13:04:42 -08001525 rbd_dev->snap_id,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001526 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001527 op_size, bio,
1528 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001529
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001530next_seg:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001531 size -= op_size;
1532 ofs += op_size;
1533
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001534 cur_seg++;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001535 rq_bio = next_bio;
1536 } while (size > 0);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001537 kref_put(&coll->kref, rbd_coll_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001538
1539 if (bp)
1540 bio_pair_release(bp);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001541 spin_lock_irq(q->queue_lock);
Josh Durgind1d25642011-12-05 14:03:05 -08001542
1543 ceph_put_snap_context(snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001544 }
1545}
1546
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	unsigned int chunk_sectors;
	sector_t sector;
	unsigned int bio_sectors;
	int max;

	/* rados object size expressed in sectors */
	chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
	bio_sectors = bmd->bi_size >> SECTOR_SHIFT;

	/* bytes left in the object past the bio's current end */
	max = (chunk_sectors - ((sector & (chunk_sectors - 1))
				+ bio_sectors)) << SECTOR_SHIFT;
	if (max < 0)
		max = 0; /* bio_add cannot handle a negative return */
	/* an empty bio always accepts at least its first bvec */
	if (max <= bvec->bv_len && bio_sectors == 0)
		return bvec->bv_len;
	return max;
}
1573
1574static void rbd_free_disk(struct rbd_device *rbd_dev)
1575{
1576 struct gendisk *disk = rbd_dev->disk;
1577
1578 if (!disk)
1579 return;
1580
1581 rbd_header_free(&rbd_dev->header);
1582
1583 if (disk->flags & GENHD_FL_UP)
1584 del_gendisk(disk);
1585 if (disk->queue)
1586 blk_cleanup_queue(disk->queue);
1587 put_disk(disk);
1588}
1589
/*
 * reload the ondisk the header
 *
 * On success the parsed header is left in *header and its on-disk
 * version in header->obj_version; the caller owns the result and must
 * release it with rbd_header_free().
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
			   struct rbd_image_header *header)
{
	ssize_t rc;
	struct rbd_image_header_ondisk *dh;
	u32 snap_count = 0;
	u64 ver;
	size_t len;

	/*
	 * First reads the fixed-size header to determine the number
	 * of snapshots, then re-reads it, along with all snapshot
	 * records as well as their stored names.
	 */
	len = sizeof (*dh);
	while (1) {
		dh = kmalloc(len, GFP_KERNEL);
		if (!dh)
			return -ENOMEM;

		rc = rbd_req_sync_read(rbd_dev,
				       NULL, CEPH_NOSNAP,
				       rbd_dev->header_name,
				       0, len,
				       (char *)dh, &ver);
		if (rc < 0)
			goto out_dh;

		rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
		if (rc < 0) {
			if (rc == -ENXIO)
				pr_warning("unrecognized header format"
					   " for image %s\n",
					   rbd_dev->image_name);
			goto out_dh;
		}

		/* snapshot count stable across reads: the copy is complete */
		if (snap_count == header->total_snaps)
			break;

		/* retry with room for the snapshot records and names */
		snap_count = header->total_snaps;
		len = sizeof (*dh) +
			snap_count * sizeof(struct rbd_image_snap_ondisk) +
			header->snap_names_len;

		rbd_header_free(header);
		kfree(dh);
	}
	header->obj_version = ver;

out_dh:
	kfree(dh);
	return rc;
}
1647
1648/*
1649 * create a snapshot
1650 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001651static int rbd_header_add_snap(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001652 const char *snap_name,
1653 gfp_t gfp_flags)
1654{
1655 int name_len = strlen(snap_name);
1656 u64 new_snapid;
1657 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001658 void *data, *p, *e;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001659 u64 ver;
Alex Elder1dbb4392012-01-24 10:08:37 -06001660 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001661
1662 /* we should create a snapshot only if we're pointing at the head */
Alex Elder0ce1a792012-07-03 16:01:18 -05001663 if (rbd_dev->snap_id != CEPH_NOSNAP)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001664 return -EINVAL;
1665
Alex Elder0ce1a792012-07-03 16:01:18 -05001666 monc = &rbd_dev->rbd_client->client->monc;
1667 ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001668 dout("created snapid=%lld\n", new_snapid);
1669 if (ret < 0)
1670 return ret;
1671
1672 data = kmalloc(name_len + 16, gfp_flags);
1673 if (!data)
1674 return -ENOMEM;
1675
Sage Weil916d4d62011-05-12 16:10:50 -07001676 p = data;
1677 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001678
Sage Weil916d4d62011-05-12 16:10:50 -07001679 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1680 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001681
Alex Elder0bed54d2012-07-03 16:01:18 -05001682 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
Alex Elder0ce1a792012-07-03 16:01:18 -05001683 "rbd", "snap_add",
Sage Weil916d4d62011-05-12 16:10:50 -07001684 data, p - data, &ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001685
Sage Weil916d4d62011-05-12 16:10:50 -07001686 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001687
1688 if (ret < 0)
1689 return ret;
1690
Alex Elder0ce1a792012-07-03 16:01:18 -05001691 down_write(&rbd_dev->header_rwsem);
1692 rbd_dev->header.snapc->seq = new_snapid;
1693 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001694
1695 return 0;
1696bad:
1697 return -ERANGE;
1698}
1699
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001700static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1701{
1702 struct rbd_snap *snap;
1703
1704 while (!list_empty(&rbd_dev->snaps)) {
1705 snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node);
1706 __rbd_remove_snap_dev(rbd_dev, snap);
1707 }
1708}
1709
/*
 * only read the first part of the ondisk header, without the snaps info
 *
 * Re-reads the image header from the osds and swaps the freshly read
 * snapshot metadata into rbd_dev->header under header_rwsem.  The old
 * snap_sizes/snap_names buffers are freed here; ownership of the new
 * buffers in 'h' transfers to rbd_dev->header.  Caller must hold
 * ctl_mutex (all callers in this file do).
 */
static int __rbd_refresh_header(struct rbd_device *rbd_dev)
{
	int ret;
	struct rbd_image_header h;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	down_write(&rbd_dev->header_rwsem);

	/* resized? */
	if (rbd_dev->snap_id == CEPH_NOSNAP) {
		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;

		dout("setting size to %llu sectors", (unsigned long long) size);
		set_capacity(rbd_dev->disk, size);
	}

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	rbd_dev->header.obj_version = h.obj_version;
	rbd_dev->header.image_size = h.image_size;
	rbd_dev->header.total_snaps = h.total_snaps;
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_names_len = h.snap_names_len;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	/* re-sync the snapshot device list with the new snap context */
	ret = __rbd_init_snaps_header(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
1755
/*
 * Read the image header, build the snapshot list, and set up the
 * gendisk and request queue for this mapping, then announce the disk.
 * Returns 0 on success or a negative errno; on failure after the disk
 * was allocated, it is released via put_disk().
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	int rc;
	u64 segment_size;
	u64 total_size = 0;

	/* contact OSD, request size info about the object being mapped */
	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (rc)
		return rc;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = __rbd_init_snaps_header(rbd_dev);
	if (rc)
		return rc;

	/* resolves the mapped snapshot name and fills in total_size */
	rc = rbd_header_set_snap(rbd_dev, &total_size);
	if (rc)
		return rc;

	/* create gendisk info */
	rc = -ENOMEM;
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		goto out;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	rc = -ENOMEM;
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;
	rbd_dev->q = q;

	/* finally, announce the disk to the world */
	set_capacity(disk, total_size / SECTOR_SIZE);
	add_disk(disk);

	pr_info("%s: added with size 0x%llx\n",
		disk->disk_name, (unsigned long long)total_size);
	return 0;

out_disk:
	put_disk(disk);
out:
	return rc;
}
1828
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001829/*
1830 sysfs
1831*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001832
Alex Elder593a9e72012-02-07 12:03:37 -06001833static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1834{
1835 return container_of(dev, struct rbd_device, dev);
1836}
1837
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001838static ssize_t rbd_size_show(struct device *dev,
1839 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001840{
Alex Elder593a9e72012-02-07 12:03:37 -06001841 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001842 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001843
Josh Durgina51aa0c2011-12-05 10:35:04 -08001844 down_read(&rbd_dev->header_rwsem);
1845 size = get_capacity(rbd_dev->disk);
1846 up_read(&rbd_dev->header_rwsem);
1847
1848 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001849}
1850
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001851static ssize_t rbd_major_show(struct device *dev,
1852 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001853{
Alex Elder593a9e72012-02-07 12:03:37 -06001854 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001855
1856 return sprintf(buf, "%d\n", rbd_dev->major);
1857}
1858
1859static ssize_t rbd_client_id_show(struct device *dev,
1860 struct device_attribute *attr, char *buf)
1861{
Alex Elder593a9e72012-02-07 12:03:37 -06001862 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001863
Alex Elder1dbb4392012-01-24 10:08:37 -06001864 return sprintf(buf, "client%lld\n",
1865 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001866}
1867
1868static ssize_t rbd_pool_show(struct device *dev,
1869 struct device_attribute *attr, char *buf)
1870{
Alex Elder593a9e72012-02-07 12:03:37 -06001871 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001872
1873 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1874}
1875
Alex Elder9bb2f332012-07-12 10:46:35 -05001876static ssize_t rbd_pool_id_show(struct device *dev,
1877 struct device_attribute *attr, char *buf)
1878{
1879 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1880
1881 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1882}
1883
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001884static ssize_t rbd_name_show(struct device *dev,
1885 struct device_attribute *attr, char *buf)
1886{
Alex Elder593a9e72012-02-07 12:03:37 -06001887 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001888
Alex Elder0bed54d2012-07-03 16:01:18 -05001889 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001890}
1891
1892static ssize_t rbd_snap_show(struct device *dev,
1893 struct device_attribute *attr,
1894 char *buf)
1895{
Alex Elder593a9e72012-02-07 12:03:37 -06001896 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001897
1898 return sprintf(buf, "%s\n", rbd_dev->snap_name);
1899}
1900
1901static ssize_t rbd_image_refresh(struct device *dev,
1902 struct device_attribute *attr,
1903 const char *buf,
1904 size_t size)
1905{
Alex Elder593a9e72012-02-07 12:03:37 -06001906 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001907 int rc;
1908 int ret = size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001909
1910 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1911
Josh Durgin263c6ca2011-12-05 10:43:42 -08001912 rc = __rbd_refresh_header(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001913 if (rc < 0)
1914 ret = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001915
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001916 mutex_unlock(&ctl_mutex);
1917 return ret;
1918}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001919
/* sysfs attributes exposed under /sys/bus/rbd/devices/<id>/ */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	&dev_attr_create_snap.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* Nothing to free here: rbd_dev teardown is handled elsewhere. */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

/* Device type attached to each rbd_device's struct device. */
static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
1961
1962
1963/*
1964 sysfs - snapshots
1965*/
1966
1967static ssize_t rbd_snap_size_show(struct device *dev,
1968 struct device_attribute *attr,
1969 char *buf)
1970{
1971 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1972
Josh Durgin35915382011-12-05 18:25:13 -08001973 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001974}
1975
1976static ssize_t rbd_snap_id_show(struct device *dev,
1977 struct device_attribute *attr,
1978 char *buf)
1979{
1980 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1981
Josh Durgin35915382011-12-05 18:25:13 -08001982 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001983}
1984
/* sysfs attributes exposed under each snap_<name> child device */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/*
 * Release callback for a snapshot device; runs when the last
 * reference to snap->dev is dropped (after device_unregister()).
 * This is where the rbd_snap and its name are finally freed.
 */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2014
/*
 * Unlink a snapshot from the device's snap list and unregister its
 * sysfs device.  The rbd_snap itself is freed by the device release
 * callback (rbd_snap_dev_release) once the last reference drops.
 */
static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
				  struct rbd_snap *snap)
{
	list_del(&snap->node);
	device_unregister(&snap->dev);
}
2021
/*
 * Register a snapshot as a "snap_<name>" child device of the rbd
 * device in sysfs.  Returns device_register()'s result.
 *
 * NOTE(review): the release function is set both here (dev->release)
 * and via rbd_snap_device_type.release; dev->release takes precedence
 * but both point at rbd_snap_dev_release, so this is redundant rather
 * than harmful — confirm before removing either.
 */
static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
				  struct rbd_snap *snap,
				  struct device *parent)
{
	struct device *dev = &snap->dev;
	int ret;

	dev->type = &rbd_snap_device_type;
	dev->parent = parent;
	dev->release = rbd_snap_dev_release;
	dev_set_name(dev, "snap_%s", snap->name);
	ret = device_register(dev);

	return ret;
}
2037
2038static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
2039 int i, const char *name,
2040 struct rbd_snap **snapp)
2041{
2042 int ret;
2043 struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
2044 if (!snap)
2045 return -ENOMEM;
2046 snap->name = kstrdup(name, GFP_KERNEL);
2047 snap->size = rbd_dev->header.snap_sizes[i];
2048 snap->id = rbd_dev->header.snapc->snaps[i];
2049 if (device_is_registered(&rbd_dev->dev)) {
2050 ret = rbd_register_snap_dev(rbd_dev, snap,
2051 &rbd_dev->dev);
2052 if (ret < 0)
2053 goto err;
2054 }
2055 *snapp = snap;
2056 return 0;
2057err:
2058 kfree(snap->name);
2059 kfree(snap);
2060 return ret;
2061}
2062
2063/*
2064 * search for the previous snap in a null delimited string list
2065 */
2066const char *rbd_prev_snap_name(const char *name, const char *start)
2067{
2068 if (name < start + 2)
2069 return NULL;
2070
2071 name -= 2;
2072 while (*name) {
2073 if (name == start)
2074 return start;
2075 name--;
2076 }
2077 return name + 1;
2078}
2079
/*
 * compare the old list of snapshots that we have to what's in the header
 * and update it accordingly. Note that the header holds the snapshots
 * in a reverse order (from newest to oldest) and we need to go from
 * older to new so that we don't get a duplicate snap name when
 * doing the process (e.g., removed snapshot and recreated a new
 * one with the same name).
 */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
{
	const char *name, *first_name;
	/* index into snapc->snaps[]; counts down as we consume entries */
	int i = rbd_dev->header.total_snaps;
	struct rbd_snap *snap, *old_snap = NULL;
	int ret;
	struct list_head *p, *n;

	/* snap_names is a NUL-delimited list; start past its end and walk back */
	first_name = rbd_dev->header.snap_names;
	name = first_name + rbd_dev->header.snap_names_len;

	/* merge: walk our list oldest-first against the header's snap ids */
	list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
		u64 cur_id;

		old_snap = list_entry(p, struct rbd_snap, node);

		/* cur_id is only read below when i != 0 (short-circuit) */
		if (i)
			cur_id = rbd_dev->header.snapc->snaps[i - 1];

		if (!i || old_snap->id < cur_id) {
			/*
			 * old_snap->id was skipped, thus was
			 * removed. If this rbd_dev is mapped to
			 * the removed snapshot, record that it no
			 * longer exists, to prevent further I/O.
			 */
			if (rbd_dev->snap_id == old_snap->id)
				rbd_dev->snap_exists = false;
			__rbd_remove_snap_dev(rbd_dev, old_snap);
			continue;
		}
		if (old_snap->id == cur_id) {
			/* we have this snapshot already */
			i--;
			name = rbd_prev_snap_name(name, first_name);
			continue;
		}
		for (; i > 0;
		     i--, name = rbd_prev_snap_name(name, first_name)) {
			if (!name) {
				WARN_ON(1);
				return -EINVAL;
			}
			/*
			 * NOTE(review): this reads snaps[i] while the check
			 * above uses snaps[i - 1]; looks inconsistent —
			 * confirm the intended indexing before changing.
			 */
			cur_id = rbd_dev->header.snapc->snaps[i];
			/* snapshot removal? handle it above */
			if (cur_id >= old_snap->id)
				break;
			/* a new snapshot */
			ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
			if (ret < 0)
				return ret;

			/* note that we add it backward so using n and not p */
			list_add(&snap->node, n);
			p = &snap->node;
		}
	}
	/* we're done going over the old snap list, just add what's left */
	for (; i > 0; i--) {
		name = rbd_prev_snap_name(name, first_name);
		if (!name) {
			WARN_ON(1);
			return -EINVAL;
		}
		ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
		if (ret < 0)
			return ret;
		list_add(&snap->node, &rbd_dev->snaps);
	}

	return 0;
}
2160
/*
 * Register the rbd device on the rbd bus in sysfs, along with a child
 * device for each of its snapshots.  Returns 0 or a negative errno.
 * On a snapshot registration failure the loop stops and the error is
 * returned; already-registered snapshot devices are left in place
 * (cleanup is driven by the caller tearing down the parent device).
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	int ret;
	struct device *dev;
	struct rbd_snap *snap;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	dev = &rbd_dev->dev;

	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->id);
	ret = device_register(dev);
	if (ret < 0)
		goto out;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		ret = rbd_register_snap_dev(rbd_dev, snap,
					     &rbd_dev->dev);
		if (ret < 0)
			break;
	}
out:
	mutex_unlock(&ctl_mutex);
	return ret;
}
2189
/*
 * Remove the rbd device (and, through the driver core, its children)
 * from sysfs; final cleanup happens in the device release callback.
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2194
/*
 * Set up a watch on the image header object so we get notified of
 * header changes.  -ERANGE from the osd means our cached header
 * version is stale: refresh the header and retry until the watch
 * sticks or a different error occurs.
 */
static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
{
	int ret, rc;

	do {
		ret = rbd_req_sync_watch(rbd_dev, rbd_dev->header_name,
					 rbd_dev->header.obj_version);
		if (ret == -ERANGE) {
			mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
			rc = __rbd_refresh_header(rbd_dev);
			mutex_unlock(&ctl_mutex);
			if (rc < 0)
				return rc;
		}
	} while (ret == -ERANGE);

	return ret;
}
2213
/* Highest rbd device id ever handed out; ids start at 1. */
static atomic64_t rbd_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_id_get(struct rbd_device *rbd_dev)
{
	rbd_dev->id = atomic64_inc_return(&rbd_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002228
Alex Elder1ddbe942012-01-29 13:57:44 -06002229/*
Alex Elder499afd52012-02-02 08:13:29 -06002230 * Remove an rbd_dev from the global list, and record that its
2231 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002232 */
Alex Elder499afd52012-02-02 08:13:29 -06002233static void rbd_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002234{
Alex Elderd184f6b2012-01-29 13:57:44 -06002235 struct list_head *tmp;
2236 int rbd_id = rbd_dev->id;
2237 int max_id;
2238
2239 BUG_ON(rbd_id < 1);
Alex Elder499afd52012-02-02 08:13:29 -06002240
2241 spin_lock(&rbd_dev_list_lock);
2242 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002243
2244 /*
2245 * If the id being "put" is not the current maximum, there
2246 * is nothing special we need to do.
2247 */
2248 if (rbd_id != atomic64_read(&rbd_id_max)) {
2249 spin_unlock(&rbd_dev_list_lock);
2250 return;
2251 }
2252
2253 /*
2254 * We need to update the current maximum id. Search the
2255 * list to find out what it is. We're more likely to find
2256 * the maximum at the end, so search the list backward.
2257 */
2258 max_id = 0;
2259 list_for_each_prev(tmp, &rbd_dev_list) {
2260 struct rbd_device *rbd_dev;
2261
2262 rbd_dev = list_entry(tmp, struct rbd_device, node);
2263 if (rbd_id > max_id)
2264 max_id = rbd_id;
2265 }
Alex Elder499afd52012-02-02 08:13:29 -06002266 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002267
Alex Elder1ddbe942012-01-29 13:57:44 -06002268 /*
Alex Elderd184f6b2012-01-29 13:57:44 -06002269 * The max id could have been updated by rbd_id_get(), in
2270 * which case it now accurately reflects the new maximum.
2271 * Be careful not to overwrite the maximum value in that
2272 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002273 */
Alex Elderd184f6b2012-01-29 13:57:44 -06002274 atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
Alex Elderb7f23c32012-01-29 13:57:43 -06002275}
2276
/*
 * Advance *buf past any leading white space (the characters isspace()
 * recognizes in the "C"/"POSIX" locales) and return the length of the
 * token that follows.  *buf must be NUL-terminated; on return it
 * points at the first non-space character (or the terminating NUL).
 */
static inline size_t next_token(const char **buf)
{
	static const char spaces[] = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* Find start of token */

	return strcspn(*buf, spaces);	/* Return token length */
}

/*
 * Find the next token in *buf and, if 'token_size' is big enough to
 * hold it plus a terminating NUL, copy it into 'token'.
 *
 * Returns the token's length (without the NUL); 0 means no token was
 * found, and a value >= token_size means the token did not fit (and
 * 'token' was left untouched).  *buf is advanced past the token in
 * every case, even when the copy was skipped.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2325
2326/*
Alex Elderea3352f2012-07-09 21:04:23 -05002327 * Finds the next token in *buf, dynamically allocates a buffer big
2328 * enough to hold a copy of it, and copies the token into the new
2329 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2330 * that a duplicate buffer is created even for a zero-length token.
2331 *
2332 * Returns a pointer to the newly-allocated duplicate, or a null
2333 * pointer if memory for the duplicate was not available. If
2334 * the lenp argument is a non-null pointer, the length of the token
2335 * (not including the '\0') is returned in *lenp.
2336 *
2337 * If successful, the *buf pointer will be updated to point beyond
2338 * the end of the found token.
2339 *
2340 * Note: uses GFP_KERNEL for allocation.
2341 */
2342static inline char *dup_token(const char **buf, size_t *lenp)
2343{
2344 char *dup;
2345 size_t len;
2346
2347 len = next_token(buf);
2348 dup = kmalloc(len + 1, GFP_KERNEL);
2349 if (!dup)
2350 return NULL;
2351
2352 memcpy(dup, *buf, len);
2353 *(dup + len) = '\0';
2354 *buf += len;
2355
2356 if (lenp)
2357 *lenp = len;
2358
2359 return dup;
2360}
2361
/*
 * This fills in the pool_name, image_name, image_name_len, snap_name,
 * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
 * on the list of monitor addresses and other options provided via
 * /sys/bus/rbd/add.
 *
 * The expected input is "<mon_addrs> <options> <pool> <image> [<snap>]".
 * The duplicated strings become owned by rbd_dev; on error everything
 * allocated here is freed again (only pool_name is re-set to NULL —
 * the caller is expected to free rbd_dev itself on failure).
 *
 * Note: rbd_dev is assumed to have been initially zero-filled.
 */
static int rbd_add_parse_args(struct rbd_device *rbd_dev,
			      const char *buf,
			      const char **mon_addrs,
			      size_t *mon_addrs_size,
			      char *options,
			      size_t options_size)
{
	size_t len;
	int ret;

	/* The first four tokens are required */

	/* monitor addresses: returned as a pointer into 'buf', not copied */
	len = next_token(&buf);
	if (!len)
		return -EINVAL;
	*mon_addrs_size = len + 1;
	*mon_addrs = buf;

	buf += len;

	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return -EINVAL;

	/* every failure below is an allocation failure */
	ret = -ENOMEM;
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Create the name of the header object */

	rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
					+ sizeof (RBD_SUFFIX),
					GFP_KERNEL);
	if (!rbd_dev->header_name)
		goto out_err;
	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);

	/*
	 * The snapshot name is optional.  If none is supplied,
	 * we use the default value.
	 */
	rbd_dev->snap_name = dup_token(&buf, &len);
	if (!rbd_dev->snap_name)
		goto out_err;
	if (!len) {
		/* Replace the empty name with the default */
		kfree(rbd_dev->snap_name);
		rbd_dev->snap_name
			= kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
		if (!rbd_dev->snap_name)
			goto out_err;

		memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
			sizeof (RBD_SNAP_HEAD_NAME));
	}

	return 0;

out_err:
	/* kfree(NULL) is safe for the fields not yet allocated */
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->image_name);
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return ret;
}
2441
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002442static ssize_t rbd_add(struct bus_type *bus,
2443 const char *buf,
2444 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002445{
Alex Eldercb8627c2012-07-09 21:04:23 -05002446 char *options;
2447 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06002448 const char *mon_addrs = NULL;
2449 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06002450 struct ceph_osd_client *osdc;
2451 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002452
2453 if (!try_module_get(THIS_MODULE))
2454 return -ENODEV;
2455
Alex Elder27cc2592012-02-02 08:13:30 -06002456 options = kmalloc(count, GFP_KERNEL);
2457 if (!options)
2458 goto err_nomem;
Alex Eldercb8627c2012-07-09 21:04:23 -05002459 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2460 if (!rbd_dev)
2461 goto err_nomem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002462
2463 /* static rbd_device initialization */
2464 spin_lock_init(&rbd_dev->lock);
2465 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002466 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08002467 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002468
Josh Durginc6666012011-11-21 17:11:12 -08002469 init_rwsem(&rbd_dev->header_rwsem);
Alex Elder0e805a12012-01-11 19:42:15 -08002470
Alex Elderd184f6b2012-01-29 13:57:44 -06002471 /* generate unique id: find highest unique id, add one */
Alex Elder499afd52012-02-02 08:13:29 -06002472 rbd_id_get(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002473
Alex Eldera725f65e2012-02-02 08:13:30 -06002474 /* Fill in the device name, now that we have its id. */
Alex Elder81a89792012-02-02 08:13:30 -06002475 BUILD_BUG_ON(DEV_NAME_LEN
2476 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
2477 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->id);
Alex Eldere124a822012-01-29 13:57:44 -06002478
Alex Eldera725f65e2012-02-02 08:13:30 -06002479 /* parse add command */
Alex Elder7ef32142012-02-02 08:13:30 -06002480 rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
Alex Eldere28fff262012-02-02 08:13:30 -06002481 options, count);
Alex Eldera725f65e2012-02-02 08:13:30 -06002482 if (rc)
2483 goto err_put_id;
2484
Alex Elder5214ecc2012-02-02 08:13:30 -06002485 rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1,
2486 options);
Alex Elderd720bcb2012-02-02 08:13:30 -06002487 if (IS_ERR(rbd_dev->rbd_client)) {
2488 rc = PTR_ERR(rbd_dev->rbd_client);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002489 goto err_put_id;
Alex Elderd720bcb2012-02-02 08:13:30 -06002490 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002491
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002492 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06002493 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002494 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2495 if (rc < 0)
2496 goto err_out_client;
Alex Elder9bb2f332012-07-12 10:46:35 -05002497 rbd_dev->pool_id = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002498
2499 /* register our block device */
Alex Elder27cc2592012-02-02 08:13:30 -06002500 rc = register_blkdev(0, rbd_dev->name);
2501 if (rc < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002502 goto err_out_client;
Alex Elder27cc2592012-02-02 08:13:30 -06002503 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002504
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002505 rc = rbd_bus_add_dev(rbd_dev);
2506 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002507 goto err_out_blkdev;
2508
Alex Elder32eec682012-02-08 16:11:14 -06002509 /*
2510 * At this point cleanup in the event of an error is the job
2511 * of the sysfs code (initiated by rbd_bus_del_dev()).
2512 *
2513 * Set up and announce blkdev mapping.
2514 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002515 rc = rbd_init_disk(rbd_dev);
2516 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002517 goto err_out_bus;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002518
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002519 rc = rbd_init_watch_dev(rbd_dev);
2520 if (rc)
2521 goto err_out_bus;
2522
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002523 return count;
2524
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002525err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002526 /* this will also clean up rest of rbd_dev stuff */
2527
2528 rbd_bus_del_dev(rbd_dev);
2529 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002530 return rc;
2531
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002532err_out_blkdev:
2533 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2534err_out_client:
2535 rbd_put_client(rbd_dev);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002536err_put_id:
Alex Eldercb8627c2012-07-09 21:04:23 -05002537 if (rbd_dev->pool_name) {
Alex Elder820a5f32012-07-09 21:04:24 -05002538 kfree(rbd_dev->snap_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002539 kfree(rbd_dev->header_name);
2540 kfree(rbd_dev->image_name);
Alex Eldercb8627c2012-07-09 21:04:23 -05002541 kfree(rbd_dev->pool_name);
2542 }
Alex Elder499afd52012-02-02 08:13:29 -06002543 rbd_id_put(rbd_dev);
Alex Elder27cc2592012-02-02 08:13:30 -06002544err_nomem:
Alex Elder27cc2592012-02-02 08:13:30 -06002545 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05002546 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06002547
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002548 dout("Error adding device %s\n", buf);
2549 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06002550
2551 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002552}
2553
2554static struct rbd_device *__rbd_get_dev(unsigned long id)
2555{
2556 struct list_head *tmp;
2557 struct rbd_device *rbd_dev;
2558
Alex Eldere124a822012-01-29 13:57:44 -06002559 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002560 list_for_each(tmp, &rbd_dev_list) {
2561 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Eldere124a822012-01-29 13:57:44 -06002562 if (rbd_dev->id == id) {
2563 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002564 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06002565 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002566 }
Alex Eldere124a822012-01-29 13:57:44 -06002567 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002568 return NULL;
2569}
2570
/*
 * Release callback for an rbd device's embedded struct device.
 * Runs once the last reference to the device is dropped (the sysfs
 * code initiates this via rbd_bus_del_dev()).  Tears down everything
 * set up at add time, then drops the module reference; the statement
 * order here matters, so do not reorder.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop the lingering watch request, if one was registered. */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	/* Tell the OSD we no longer watch the header object. */
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev, rbd_dev->header_name);

	/* Done talking to the cluster; drop our client reference. */
	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->snap_name);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
2601
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002602static ssize_t rbd_remove(struct bus_type *bus,
2603 const char *buf,
2604 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002605{
2606 struct rbd_device *rbd_dev = NULL;
2607 int target_id, rc;
2608 unsigned long ul;
2609 int ret = count;
2610
2611 rc = strict_strtoul(buf, 10, &ul);
2612 if (rc)
2613 return rc;
2614
2615 /* convert to int; abort if we lost anything in the conversion */
2616 target_id = (int) ul;
2617 if (target_id != ul)
2618 return -EINVAL;
2619
2620 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2621
2622 rbd_dev = __rbd_get_dev(target_id);
2623 if (!rbd_dev) {
2624 ret = -ENOENT;
2625 goto done;
2626 }
2627
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002628 __rbd_remove_all_snaps(rbd_dev);
2629 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002630
2631done:
2632 mutex_unlock(&ctl_mutex);
2633 return ret;
2634}
2635
/*
 * sysfs store handler: create a new snapshot named by the string
 * written to the attribute, then refresh the local header and notify
 * other watchers of the header object.
 *
 * Returns count on success, or a negative errno.
 */
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int ret;
	char *name = kmalloc(count + 1, GFP_KERNEL);
	if (!name)
		return -ENOMEM;

	/*
	 * snprintf with size == count copies at most count - 1 bytes,
	 * which drops the final input byte (normally the trailing
	 * newline sysfs input carries).  NOTE(review): input without a
	 * trailing newline would lose its last character -- confirm
	 * this is intended.
	 */
	snprintf(name, count, "%s", buf);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	ret = rbd_header_add_snap(rbd_dev,
				  name, GFP_KERNEL);
	if (ret < 0)
		goto err_unlock;

	/* Re-read the header so the new snapshot is visible locally. */
	ret = __rbd_refresh_header(rbd_dev);
	if (ret < 0)
		goto err_unlock;

	/* shouldn't hold ctl_mutex when notifying.. notify might
	   trigger a watch callback that would need to get that mutex */
	mutex_unlock(&ctl_mutex);

	/* make a best effort, don't error if failed */
	rbd_req_sync_notify(rbd_dev, rbd_dev->header_name);

	ret = count;
	kfree(name);
	return ret;

err_unlock:
	mutex_unlock(&ctl_mutex);
	kfree(name);
	return ret;
}
2676
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002677/*
2678 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002679 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002680 */
2681static int rbd_sysfs_init(void)
2682{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002683 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002684
Alex Elderfed4c142012-02-07 12:03:36 -06002685 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06002686 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002687 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002688
Alex Elderfed4c142012-02-07 12:03:36 -06002689 ret = bus_register(&rbd_bus_type);
2690 if (ret < 0)
2691 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002692
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002693 return ret;
2694}
2695
/* Tear down the sysfs interface, in reverse order of rbd_sysfs_init(). */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
2701
2702int __init rbd_init(void)
2703{
2704 int rc;
2705
2706 rc = rbd_sysfs_init();
2707 if (rc)
2708 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002709 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002710 return 0;
2711}
2712
/* Module exit point: remove the sysfs bus/device interface. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2717
/* Module entry/exit hooks and module metadata. */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");