blob: 7d4735c9dba59231ffe570c2425977a153a17c35 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
/*
 * Sector geometry.  Linux block code (blk, bio, genhd) counts I/O in
 * sectors, and the default sector is universally 512 bytes.  These
 * names are only slightly more meaningful than the bare numbers.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* Short and long driver names. */
#define RBD_DRV_NAME		"rbd"
#define RBD_DRV_NAME_LONG	"rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

#define RBD_MAX_SNAP_NAME_LEN	32
#define RBD_MAX_OPT_LEN		1024

/* Snapshot name that selects the image head rather than a snapshot. */
#define RBD_SNAP_HEAD_NAME	"-"

/*
 * A device is named "rbd#": RBD_DRV_NAME followed by a unique integer
 * id.  MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is large
 * enough to hold every possible device name.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

/* Initial value of rbd_options.notify_timeout. */
#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
73
Yehuda Sadeh602adf42010-08-12 16:11:25 -070074/*
75 * block device image metadata (in-memory version)
76 */
77struct rbd_image_header {
78 u64 image_size;
Alex Elder849b4262012-07-09 21:04:24 -050079 char *object_prefix;
Yehuda Sadeh602adf42010-08-12 16:11:25 -070080 __u8 obj_order;
81 __u8 crypt_type;
82 __u8 comp_type;
Yehuda Sadeh602adf42010-08-12 16:11:25 -070083 struct ceph_snap_context *snapc;
84 size_t snap_names_len;
85 u64 snap_seq;
86 u32 total_snaps;
87
88 char *snap_names;
89 u64 *snap_sizes;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070090
91 u64 obj_version;
92};
93
/* rbd-specific options parsed by parse_rbd_opts_token(). */
struct rbd_options {
	int	notify_timeout;	/* set by the "notify_timeout=%d" option */
};
97
98/*
Alex Elderf0f8cef2012-01-29 13:57:44 -060099 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700100 */
101struct rbd_client {
102 struct ceph_client *client;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700103 struct rbd_options *rbd_opts;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700104 struct kref kref;
105 struct list_head node;
106};
107
108/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600109 * a request completion status
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700110 */
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700111struct rbd_req_status {
112 int done;
113 int rc;
114 u64 bytes;
115};
116
117/*
118 * a collection of requests
119 */
120struct rbd_req_coll {
121 int total;
122 int num_done;
123 struct kref kref;
124 struct rbd_req_status status[0];
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700125};
126
Alex Elderf0f8cef2012-01-29 13:57:44 -0600127/*
128 * a single io request
129 */
130struct rbd_request {
131 struct request *rq; /* blk layer request */
132 struct bio *bio; /* cloned bio */
133 struct page **pages; /* list of used pages */
134 u64 len;
135 int coll_index;
136 struct rbd_req_coll *coll;
137};
138
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800139struct rbd_snap {
140 struct device dev;
141 const char *name;
Josh Durgin35915382011-12-05 18:25:13 -0800142 u64 size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800143 struct list_head node;
144 u64 id;
145};
146
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700147/*
148 * a single device
149 */
150struct rbd_device {
151 int id; /* blkdev unique id */
152
153 int major; /* blkdev assigned major */
154 struct gendisk *disk; /* blkdev's gendisk and rq */
155 struct request_queue *q;
156
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700157 struct rbd_client *rbd_client;
158
159 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
160
161 spinlock_t lock; /* queue lock */
162
163 struct rbd_image_header header;
Alex Elderbf3e5ae2012-07-09 21:04:23 -0500164 char *obj; /* rbd image name */
165 size_t obj_len;
Alex Eldercb8627c2012-07-09 21:04:23 -0500166 char *obj_md_name; /* hdr nm. */
Alex Elderd22f76e2012-07-12 10:46:35 -0500167 char *pool_name;
Alex Elder9bb2f332012-07-12 10:46:35 -0500168 int pool_id;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700169
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700170 struct ceph_osd_event *watch_event;
171 struct ceph_osd_request *watch_request;
172
Josh Durginc6666012011-11-21 17:11:12 -0800173 /* protects updating the header */
174 struct rw_semaphore header_rwsem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700175 char snap_name[RBD_MAX_SNAP_NAME_LEN];
Josh Durgin77dfe992011-11-21 13:04:42 -0800176 u64 snap_id; /* current snapshot id */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700177 int read_only;
178
179 struct list_head node;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800180
181 /* list of snapshots */
182 struct list_head snaps;
183
184 /* sysfs related */
185 struct device dev;
186};
187
/* Serialize open/close/setup/teardown */
static DEFINE_MUTEX(ctl_mutex);

/* All devices; presumably guarded by rbd_dev_list_lock. */
static LIST_HEAD(rbd_dev_list);
static DEFINE_SPINLOCK(rbd_dev_list_lock);

/* All clients; list updates and lookups take rbd_client_list_lock. */
static LIST_HEAD(rbd_client_list);
static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700195
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800196static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
197static void rbd_dev_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800198static ssize_t rbd_snap_add(struct device *dev,
199 struct device_attribute *attr,
200 const char *buf,
201 size_t count);
202static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
Justin P. Mattock69932482011-07-26 23:06:29 -0700203 struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800204
Alex Elderf0f8cef2012-01-29 13:57:44 -0600205static ssize_t rbd_add(struct bus_type *bus, const char *buf,
206 size_t count);
207static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
208 size_t count);
209
210static struct bus_attribute rbd_bus_attrs[] = {
211 __ATTR(add, S_IWUSR, NULL, rbd_add),
212 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
213 __ATTR_NULL
214};
215
216static struct bus_type rbd_bus_type = {
217 .name = "rbd",
218 .bus_attrs = rbd_bus_attrs,
219};
220
/* Release callback for the statically allocated root device: no-op. */
static void rbd_root_dev_release(struct device *dev)
{
	/* rbd_root_dev is static storage; there is nothing to free. */
}
224
225static struct device rbd_root_dev = {
226 .init_name = "rbd",
227 .release = rbd_root_dev_release,
228};
229
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800230
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800231static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
232{
233 return get_device(&rbd_dev->dev);
234}
235
236static void rbd_put_dev(struct rbd_device *rbd_dev)
237{
238 put_device(&rbd_dev->dev);
239}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700240
/* Defined later in the file. */
static int __rbd_refresh_header(struct rbd_device *rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700242
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700243static int rbd_open(struct block_device *bdev, fmode_t mode)
244{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600245 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700246
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800247 rbd_get_dev(rbd_dev);
248
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700249 set_device_ro(bdev, rbd_dev->read_only);
250
251 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
252 return -EROFS;
253
254 return 0;
255}
256
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800257static int rbd_release(struct gendisk *disk, fmode_t mode)
258{
259 struct rbd_device *rbd_dev = disk->private_data;
260
261 rbd_put_dev(rbd_dev);
262
263 return 0;
264}
265
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700266static const struct block_device_operations rbd_bd_ops = {
267 .owner = THIS_MODULE,
268 .open = rbd_open,
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800269 .release = rbd_release,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700270};
271
272/*
273 * Initialize an rbd client instance.
274 * We own *opt.
275 */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700276static struct rbd_client *rbd_client_create(struct ceph_options *opt,
277 struct rbd_options *rbd_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700278{
279 struct rbd_client *rbdc;
280 int ret = -ENOMEM;
281
282 dout("rbd_client_create\n");
283 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
284 if (!rbdc)
285 goto out_opt;
286
287 kref_init(&rbdc->kref);
288 INIT_LIST_HEAD(&rbdc->node);
289
Alex Elderbc534d82012-01-29 13:57:44 -0600290 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
291
Sage Weil6ab00d42011-08-09 09:41:59 -0700292 rbdc->client = ceph_create_client(opt, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700293 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600294 goto out_mutex;
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400295 opt = NULL; /* Now rbdc->client is responsible for opt */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700296
297 ret = ceph_open_session(rbdc->client);
298 if (ret < 0)
299 goto out_err;
300
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700301 rbdc->rbd_opts = rbd_opts;
302
Alex Elder432b8582012-01-29 13:57:44 -0600303 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700304 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600305 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700306
Alex Elderbc534d82012-01-29 13:57:44 -0600307 mutex_unlock(&ctl_mutex);
308
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700309 dout("rbd_client_create created %p\n", rbdc);
310 return rbdc;
311
312out_err:
313 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600314out_mutex:
315 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700316 kfree(rbdc);
317out_opt:
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400318 if (opt)
319 ceph_destroy_options(opt);
320 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700321}
322
323/*
324 * Find a ceph client with specific addr and configuration.
325 */
326static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
327{
328 struct rbd_client *client_node;
329
330 if (opt->flags & CEPH_OPT_NOSHARE)
331 return NULL;
332
333 list_for_each_entry(client_node, &rbd_client_list, node)
334 if (ceph_compare_options(opt, client_node->client) == 0)
335 return client_node;
336 return NULL;
337}
338
/*
 * Mount option tokens.  Tokens below Opt_last_int take an integer
 * argument; tokens between Opt_last_int and Opt_last_string take a
 * string argument.
 */
enum {
	Opt_notify_timeout,
	Opt_last_int,		/* int args above */
	Opt_last_string,	/* string args above */
};
349
350static match_table_t rbdopt_tokens = {
351 {Opt_notify_timeout, "notify_timeout=%d"},
352 /* int args above */
353 /* string args above */
354 {-1, NULL}
355};
356
357static int parse_rbd_opts_token(char *c, void *private)
358{
359 struct rbd_options *rbdopt = private;
360 substring_t argstr[MAX_OPT_ARGS];
361 int token, intval, ret;
362
Alex Elder21079782012-01-24 10:08:36 -0600363 token = match_token(c, rbdopt_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700364 if (token < 0)
365 return -EINVAL;
366
367 if (token < Opt_last_int) {
368 ret = match_int(&argstr[0], &intval);
369 if (ret < 0) {
370 pr_err("bad mount option arg (not int) "
371 "at '%s'\n", c);
372 return ret;
373 }
374 dout("got int token %d val %d\n", token, intval);
375 } else if (token > Opt_last_int && token < Opt_last_string) {
376 dout("got string token %d val %s\n", token,
377 argstr[0].from);
378 } else {
379 dout("got token %d\n", token);
380 }
381
382 switch (token) {
383 case Opt_notify_timeout:
384 rbdopt->notify_timeout = intval;
385 break;
386 default:
387 BUG_ON(token);
388 }
389 return 0;
390}
391
392/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700393 * Get a ceph client with specific addr and configuration, if one does
394 * not exist create it.
395 */
Alex Elder5214ecc2012-02-02 08:13:30 -0600396static struct rbd_client *rbd_get_client(const char *mon_addr,
397 size_t mon_addr_len,
398 char *options)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700399{
400 struct rbd_client *rbdc;
401 struct ceph_options *opt;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700402 struct rbd_options *rbd_opts;
403
404 rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
405 if (!rbd_opts)
Alex Elderd720bcb2012-02-02 08:13:30 -0600406 return ERR_PTR(-ENOMEM);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700407
408 rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700409
Alex Elderee577412012-01-24 10:08:36 -0600410 opt = ceph_parse_options(options, mon_addr,
Alex Elder5214ecc2012-02-02 08:13:30 -0600411 mon_addr + mon_addr_len,
Alex Elder21079782012-01-24 10:08:36 -0600412 parse_rbd_opts_token, rbd_opts);
Alex Elderee577412012-01-24 10:08:36 -0600413 if (IS_ERR(opt)) {
Alex Elderd720bcb2012-02-02 08:13:30 -0600414 kfree(rbd_opts);
415 return ERR_CAST(opt);
Alex Elderee577412012-01-24 10:08:36 -0600416 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700417
Alex Elder432b8582012-01-29 13:57:44 -0600418 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700419 rbdc = __rbd_client_find(opt);
420 if (rbdc) {
Alex Eldere6994d32012-01-29 13:57:44 -0600421 /* using an existing client */
422 kref_get(&rbdc->kref);
Alex Elder432b8582012-01-29 13:57:44 -0600423 spin_unlock(&rbd_client_list_lock);
Alex Eldere6994d32012-01-29 13:57:44 -0600424
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700425 ceph_destroy_options(opt);
Alex Elder97bb59a2012-01-24 10:08:36 -0600426 kfree(rbd_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700427
Alex Elderd720bcb2012-02-02 08:13:30 -0600428 return rbdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700429 }
Alex Elder432b8582012-01-29 13:57:44 -0600430 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700431
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700432 rbdc = rbd_client_create(opt, rbd_opts);
Alex Elderd97081b2012-01-29 13:57:44 -0600433
Alex Elderd720bcb2012-02-02 08:13:30 -0600434 if (IS_ERR(rbdc))
435 kfree(rbd_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700436
Alex Elderd720bcb2012-02-02 08:13:30 -0600437 return rbdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700438}
439
440/*
441 * Destroy ceph client
Alex Elderd23a4b32012-01-29 13:57:43 -0600442 *
Alex Elder432b8582012-01-29 13:57:44 -0600443 * Caller must hold rbd_client_list_lock.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700444 */
445static void rbd_client_release(struct kref *kref)
446{
447 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
448
449 dout("rbd_release_client %p\n", rbdc);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500450 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700451 list_del(&rbdc->node);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500452 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700453
454 ceph_destroy_client(rbdc->client);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700455 kfree(rbdc->rbd_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700456 kfree(rbdc);
457}
458
459/*
460 * Drop reference to ceph client node. If it's not referenced anymore, release
461 * it.
462 */
463static void rbd_put_client(struct rbd_device *rbd_dev)
464{
465 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
466 rbd_dev->rbd_client = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700467}
468
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700469/*
470 * Destroy requests collection
471 */
472static void rbd_coll_release(struct kref *kref)
473{
474 struct rbd_req_coll *coll =
475 container_of(kref, struct rbd_req_coll, kref);
476
477 dout("rbd_coll_release %p\n", coll);
478 kfree(coll);
479}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700480
481/*
482 * Create a new header structure, translate header format from the on-disk
483 * header.
484 */
485static int rbd_header_from_disk(struct rbd_image_header *header,
486 struct rbd_image_header_ondisk *ondisk,
Xi Wang50f7c4c2012-04-20 15:49:44 -0500487 u32 allocated_snaps,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700488 gfp_t gfp_flags)
489{
Xi Wang50f7c4c2012-04-20 15:49:44 -0500490 u32 i, snap_count;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700491
Alex Elder21079782012-01-24 10:08:36 -0600492 if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT)))
Josh Durgin81e759f2011-11-15 14:49:53 -0800493 return -ENXIO;
Josh Durgin81e759f2011-11-15 14:49:53 -0800494
Alex Elder00f1f362012-02-07 12:03:36 -0600495 snap_count = le32_to_cpu(ondisk->snap_count);
Xi Wang50f7c4c2012-04-20 15:49:44 -0500496 if (snap_count > (UINT_MAX - sizeof(struct ceph_snap_context))
497 / sizeof (*ondisk))
498 return -EINVAL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700499 header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
Yan, Zhengf9f9a192012-06-06 09:15:33 -0500500 snap_count * sizeof(u64),
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700501 gfp_flags);
502 if (!header->snapc)
503 return -ENOMEM;
Alex Elder00f1f362012-02-07 12:03:36 -0600504
Alex Elder00f1f362012-02-07 12:03:36 -0600505 header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700506 if (snap_count) {
507 header->snap_names = kmalloc(header->snap_names_len,
Dan Carpenterf8ad4952012-04-20 15:49:44 -0500508 gfp_flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700509 if (!header->snap_names)
510 goto err_snapc;
511 header->snap_sizes = kmalloc(snap_count * sizeof(u64),
Dan Carpenterf8ad4952012-04-20 15:49:44 -0500512 gfp_flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700513 if (!header->snap_sizes)
514 goto err_names;
515 } else {
516 header->snap_names = NULL;
517 header->snap_sizes = NULL;
518 }
Alex Elder849b4262012-07-09 21:04:24 -0500519
520 header->object_prefix = kmalloc(sizeof (ondisk->block_name) + 1,
521 gfp_flags);
522 if (!header->object_prefix)
523 goto err_sizes;
524
Alex Elderca1e49a2012-07-10 20:30:09 -0500525 memcpy(header->object_prefix, ondisk->block_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700526 sizeof(ondisk->block_name));
Alex Elder849b4262012-07-09 21:04:24 -0500527 header->object_prefix[sizeof (ondisk->block_name)] = '\0';
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700528
529 header->image_size = le64_to_cpu(ondisk->image_size);
530 header->obj_order = ondisk->options.order;
531 header->crypt_type = ondisk->options.crypt_type;
532 header->comp_type = ondisk->options.comp_type;
533
534 atomic_set(&header->snapc->nref, 1);
535 header->snap_seq = le64_to_cpu(ondisk->snap_seq);
536 header->snapc->num_snaps = snap_count;
537 header->total_snaps = snap_count;
538
Alex Elder21079782012-01-24 10:08:36 -0600539 if (snap_count && allocated_snaps == snap_count) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700540 for (i = 0; i < snap_count; i++) {
541 header->snapc->snaps[i] =
542 le64_to_cpu(ondisk->snaps[i].id);
543 header->snap_sizes[i] =
544 le64_to_cpu(ondisk->snaps[i].image_size);
545 }
546
547 /* copy snapshot names */
548 memcpy(header->snap_names, &ondisk->snaps[i],
549 header->snap_names_len);
550 }
551
552 return 0;
553
Alex Elder849b4262012-07-09 21:04:24 -0500554err_sizes:
555 kfree(header->snap_sizes);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700556err_names:
557 kfree(header->snap_names);
558err_snapc:
559 kfree(header->snapc);
Alex Elder00f1f362012-02-07 12:03:36 -0600560 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700561}
562
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700563static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
564 u64 *seq, u64 *size)
565{
566 int i;
567 char *p = header->snap_names;
568
Alex Elder00f1f362012-02-07 12:03:36 -0600569 for (i = 0; i < header->total_snaps; i++) {
570 if (!strcmp(snap_name, p)) {
571
572 /* Found it. Pass back its id and/or size */
573
574 if (seq)
575 *seq = header->snapc->snaps[i];
576 if (size)
577 *size = header->snap_sizes[i];
578 return i;
579 }
580 p += strlen(p) + 1; /* Skip ahead to the next name */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700581 }
Alex Elder00f1f362012-02-07 12:03:36 -0600582 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700583}
584
Josh Durgincc9d7342011-11-21 18:19:13 -0800585static int rbd_header_set_snap(struct rbd_device *dev, u64 *size)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700586{
587 struct rbd_image_header *header = &dev->header;
588 struct ceph_snap_context *snapc = header->snapc;
589 int ret = -ENOENT;
590
Josh Durgincc9d7342011-11-21 18:19:13 -0800591 BUILD_BUG_ON(sizeof (dev->snap_name) < sizeof (RBD_SNAP_HEAD_NAME));
592
Josh Durginc6666012011-11-21 17:11:12 -0800593 down_write(&dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700594
Josh Durgincc9d7342011-11-21 18:19:13 -0800595 if (!memcmp(dev->snap_name, RBD_SNAP_HEAD_NAME,
596 sizeof (RBD_SNAP_HEAD_NAME))) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700597 if (header->total_snaps)
598 snapc->seq = header->snap_seq;
599 else
600 snapc->seq = 0;
Josh Durgin77dfe992011-11-21 13:04:42 -0800601 dev->snap_id = CEPH_NOSNAP;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700602 dev->read_only = 0;
603 if (size)
604 *size = header->image_size;
605 } else {
Josh Durgincc9d7342011-11-21 18:19:13 -0800606 ret = snap_by_name(header, dev->snap_name, &snapc->seq, size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700607 if (ret < 0)
608 goto done;
Josh Durgin77dfe992011-11-21 13:04:42 -0800609 dev->snap_id = snapc->seq;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700610 dev->read_only = 1;
611 }
612
613 ret = 0;
614done:
Josh Durginc6666012011-11-21 17:11:12 -0800615 up_write(&dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700616 return ret;
617}
618
619static void rbd_header_free(struct rbd_image_header *header)
620{
Alex Elder849b4262012-07-09 21:04:24 -0500621 kfree(header->object_prefix);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700622 kfree(header->snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -0500623 kfree(header->snap_names);
624 kfree(header->snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700625}
626
627/*
628 * get the actual striped segment name, offset and length
629 */
630static u64 rbd_get_segment(struct rbd_image_header *header,
Alex Elderca1e49a2012-07-10 20:30:09 -0500631 const char *object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700632 u64 ofs, u64 len,
633 char *seg_name, u64 *segofs)
634{
635 u64 seg = ofs >> header->obj_order;
636
637 if (seg_name)
638 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
Alex Elderca1e49a2012-07-10 20:30:09 -0500639 "%s.%012llx", object_prefix, seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700640
641 ofs = ofs & ((1 << header->obj_order) - 1);
642 len = min_t(u64, len, (1 << header->obj_order) - ofs);
643
644 if (segofs)
645 *segofs = ofs;
646
647 return len;
648}
649
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700650static int rbd_get_num_segments(struct rbd_image_header *header,
651 u64 ofs, u64 len)
652{
653 u64 start_seg = ofs >> header->obj_order;
654 u64 end_seg = (ofs + len - 1) >> header->obj_order;
655 return end_seg - start_seg + 1;
656}
657
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700658/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700659 * returns the size of an object in the image
660 */
661static u64 rbd_obj_bytes(struct rbd_image_header *header)
662{
663 return 1 << header->obj_order;
664}
665
666/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700667 * bio helpers
668 */
669
670static void bio_chain_put(struct bio *chain)
671{
672 struct bio *tmp;
673
674 while (chain) {
675 tmp = chain;
676 chain = chain->bi_next;
677 bio_put(tmp);
678 }
679}
680
681/*
682 * zeros a bio chain, starting at specific offset
683 */
684static void zero_bio_chain(struct bio *chain, int start_ofs)
685{
686 struct bio_vec *bv;
687 unsigned long flags;
688 void *buf;
689 int i;
690 int pos = 0;
691
692 while (chain) {
693 bio_for_each_segment(bv, chain, i) {
694 if (pos + bv->bv_len > start_ofs) {
695 int remainder = max(start_ofs - pos, 0);
696 buf = bvec_kmap_irq(bv, &flags);
697 memset(buf + remainder, 0,
698 bv->bv_len - remainder);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +0200699 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700700 }
701 pos += bv->bv_len;
702 }
703
704 chain = chain->bi_next;
705 }
706}
707
708/*
709 * bio_chain_clone - clone a chain of bios up to a certain length.
710 * might return a bio_pair that will need to be released.
711 */
712static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
713 struct bio_pair **bp,
714 int len, gfp_t gfpmask)
715{
716 struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
717 int total = 0;
718
719 if (*bp) {
720 bio_pair_release(*bp);
721 *bp = NULL;
722 }
723
724 while (old_chain && (total < len)) {
725 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
726 if (!tmp)
727 goto err_out;
728
729 if (total + old_chain->bi_size > len) {
730 struct bio_pair *bp;
731
732 /*
733 * this split can only happen with a single paged bio,
734 * split_bio will BUG_ON if this is not the case
735 */
736 dout("bio_chain_clone split! total=%d remaining=%d"
737 "bi_size=%d\n",
738 (int)total, (int)len-total,
739 (int)old_chain->bi_size);
740
741 /* split the bio. We'll release it either in the next
742 call, or it will have to be released outside */
Alex Elder593a9e72012-02-07 12:03:37 -0600743 bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700744 if (!bp)
745 goto err_out;
746
747 __bio_clone(tmp, &bp->bio1);
748
749 *next = &bp->bio2;
750 } else {
751 __bio_clone(tmp, old_chain);
752 *next = old_chain->bi_next;
753 }
754
755 tmp->bi_bdev = NULL;
756 gfpmask &= ~__GFP_WAIT;
757 tmp->bi_next = NULL;
758
759 if (!new_chain) {
760 new_chain = tail = tmp;
761 } else {
762 tail->bi_next = tmp;
763 tail = tmp;
764 }
765 old_chain = old_chain->bi_next;
766
767 total += tmp->bi_size;
768 }
769
770 BUG_ON(total < len);
771
772 if (tail)
773 tail->bi_next = NULL;
774
775 *old = old_chain;
776
777 return new_chain;
778
779err_out:
780 dout("bio_chain_clone with err\n");
781 bio_chain_put(new_chain);
782 return NULL;
783}
784
785/*
786 * helpers for osd request op vectors.
787 */
788static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
789 int num_ops,
790 int opcode,
791 u32 payload_len)
792{
793 *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
794 GFP_NOIO);
795 if (!*ops)
796 return -ENOMEM;
797 (*ops)[0].op = opcode;
798 /*
799 * op extent offset and length will be set later on
800 * in calc_raw_layout()
801 */
802 (*ops)[0].payload_len = payload_len;
803 return 0;
804}
805
/* Free an op vector obtained from rbd_create_rw_ops(). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
810
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700811static void rbd_coll_end_req_index(struct request *rq,
812 struct rbd_req_coll *coll,
813 int index,
814 int ret, u64 len)
815{
816 struct request_queue *q;
817 int min, max, i;
818
819 dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n",
820 coll, index, ret, len);
821
822 if (!rq)
823 return;
824
825 if (!coll) {
826 blk_end_request(rq, ret, len);
827 return;
828 }
829
830 q = rq->q;
831
832 spin_lock_irq(q->queue_lock);
833 coll->status[index].done = 1;
834 coll->status[index].rc = ret;
835 coll->status[index].bytes = len;
836 max = min = coll->num_done;
837 while (max < coll->total && coll->status[max].done)
838 max++;
839
840 for (i = min; i<max; i++) {
841 __blk_end_request(rq, coll->status[i].rc,
842 coll->status[i].bytes);
843 coll->num_done++;
844 kref_put(&coll->kref, rbd_coll_release);
845 }
846 spin_unlock_irq(q->queue_lock);
847}
848
849static void rbd_coll_end_req(struct rbd_request *req,
850 int ret, u64 len)
851{
852 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
853}
854
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700855/*
856 * Send ceph osd request
857 */
858static int rbd_do_request(struct request *rq,
859 struct rbd_device *dev,
860 struct ceph_snap_context *snapc,
861 u64 snapid,
862 const char *obj, u64 ofs, u64 len,
863 struct bio *bio,
864 struct page **pages,
865 int num_pages,
866 int flags,
867 struct ceph_osd_req_op *ops,
868 int num_reply,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700869 struct rbd_req_coll *coll,
870 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700871 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700872 struct ceph_msg *msg),
873 struct ceph_osd_request **linger_req,
874 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700875{
876 struct ceph_osd_request *req;
877 struct ceph_file_layout *layout;
878 int ret;
879 u64 bno;
880 struct timespec mtime = CURRENT_TIME;
881 struct rbd_request *req_data;
882 struct ceph_osd_request_head *reqhead;
Alex Elder1dbb4392012-01-24 10:08:37 -0600883 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700884
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700885 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700886 if (!req_data) {
887 if (coll)
888 rbd_coll_end_req_index(rq, coll, coll_index,
889 -ENOMEM, len);
890 return -ENOMEM;
891 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700892
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700893 if (coll) {
894 req_data->coll = coll;
895 req_data->coll_index = coll_index;
896 }
897
898 dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700899
Josh Durginc6666012011-11-21 17:11:12 -0800900 down_read(&dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700901
Alex Elder1dbb4392012-01-24 10:08:37 -0600902 osdc = &dev->rbd_client->client->osdc;
903 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
904 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -0700905 if (!req) {
Josh Durginc6666012011-11-21 17:11:12 -0800906 up_read(&dev->header_rwsem);
Sage Weil4ad12622011-05-03 09:23:36 -0700907 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700908 goto done_pages;
909 }
910
911 req->r_callback = rbd_cb;
912
913 req_data->rq = rq;
914 req_data->bio = bio;
915 req_data->pages = pages;
916 req_data->len = len;
917
918 req->r_priv = req_data;
919
920 reqhead = req->r_request->front.iov_base;
921 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
922
923 strncpy(req->r_oid, obj, sizeof(req->r_oid));
924 req->r_oid_len = strlen(req->r_oid);
925
926 layout = &req->r_file_layout;
927 memset(layout, 0, sizeof(*layout));
928 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
929 layout->fl_stripe_count = cpu_to_le32(1);
930 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
Alex Elder9bb2f332012-07-12 10:46:35 -0500931 layout->fl_pg_pool = cpu_to_le32(dev->pool_id);
Alex Elder1dbb4392012-01-24 10:08:37 -0600932 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
933 req, ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700934
935 ceph_osdc_build_request(req, ofs, &len,
936 ops,
937 snapc,
938 &mtime,
939 req->r_oid, req->r_oid_len);
Josh Durginc6666012011-11-21 17:11:12 -0800940 up_read(&dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700941
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700942 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600943 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700944 *linger_req = req;
945 }
946
Alex Elder1dbb4392012-01-24 10:08:37 -0600947 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700948 if (ret < 0)
949 goto done_err;
950
951 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600952 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700953 if (ver)
954 *ver = le64_to_cpu(req->r_reassert_version.version);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700955 dout("reassert_ver=%lld\n",
956 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700957 ceph_osdc_put_request(req);
958 }
959 return ret;
960
961done_err:
962 bio_chain_put(req_data->bio);
963 ceph_osdc_put_request(req);
964done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700965 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700966 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700967 return ret;
968}
969
970/*
971 * Ceph osd op callback
972 */
973static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
974{
975 struct rbd_request *req_data = req->r_priv;
976 struct ceph_osd_reply_head *replyhead;
977 struct ceph_osd_op *op;
978 __s32 rc;
979 u64 bytes;
980 int read_op;
981
982 /* parse reply */
983 replyhead = msg->front.iov_base;
984 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
985 op = (void *)(replyhead + 1);
986 rc = le32_to_cpu(replyhead->result);
987 bytes = le64_to_cpu(op->extent.length);
Dan Carpenter895cfcc2012-06-06 09:15:33 -0500988 read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700989
990 dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);
991
992 if (rc == -ENOENT && read_op) {
993 zero_bio_chain(req_data->bio, 0);
994 rc = 0;
995 } else if (rc == 0 && read_op && bytes < req_data->len) {
996 zero_bio_chain(req_data->bio, bytes);
997 bytes = req_data->len;
998 }
999
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001000 rbd_coll_end_req(req_data, rc, bytes);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001001
1002 if (req_data->bio)
1003 bio_chain_put(req_data->bio);
1004
1005 ceph_osdc_put_request(req);
1006 kfree(req_data);
1007}
1008
/*
 * Minimal osd op callback: the reply message is not examined, the
 * only action is to drop a reference on the osd request.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req,
			      struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1013
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001014/*
1015 * Do a synchronous ceph osd operation
1016 */
1017static int rbd_req_sync_op(struct rbd_device *dev,
1018 struct ceph_snap_context *snapc,
1019 u64 snapid,
1020 int opcode,
1021 int flags,
1022 struct ceph_osd_req_op *orig_ops,
1023 int num_reply,
1024 const char *obj,
1025 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001026 char *buf,
1027 struct ceph_osd_request **linger_req,
1028 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001029{
1030 int ret;
1031 struct page **pages;
1032 int num_pages;
1033 struct ceph_osd_req_op *ops = orig_ops;
1034 u32 payload_len;
1035
1036 num_pages = calc_pages_for(ofs , len);
1037 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
Dan Carpenterb8d06382010-10-11 21:14:23 +02001038 if (IS_ERR(pages))
1039 return PTR_ERR(pages);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001040
1041 if (!orig_ops) {
1042 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
1043 ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
1044 if (ret < 0)
1045 goto done;
1046
1047 if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
1048 ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
1049 if (ret < 0)
1050 goto done_ops;
1051 }
1052 }
1053
1054 ret = rbd_do_request(NULL, dev, snapc, snapid,
1055 obj, ofs, len, NULL,
1056 pages, num_pages,
1057 flags,
1058 ops,
1059 2,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001060 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001061 NULL,
1062 linger_req, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001063 if (ret < 0)
1064 goto done_ops;
1065
1066 if ((flags & CEPH_OSD_FLAG_READ) && buf)
1067 ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
1068
1069done_ops:
1070 if (!orig_ops)
1071 rbd_destroy_ops(ops);
1072done:
1073 ceph_release_page_vector(pages, num_pages);
1074 return ret;
1075}
1076
1077/*
1078 * Do an asynchronous ceph osd operation
1079 */
1080static int rbd_do_op(struct request *rq,
1081 struct rbd_device *rbd_dev ,
1082 struct ceph_snap_context *snapc,
1083 u64 snapid,
1084 int opcode, int flags, int num_reply,
1085 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001086 struct bio *bio,
1087 struct rbd_req_coll *coll,
1088 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001089{
1090 char *seg_name;
1091 u64 seg_ofs;
1092 u64 seg_len;
1093 int ret;
1094 struct ceph_osd_req_op *ops;
1095 u32 payload_len;
1096
1097 seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
1098 if (!seg_name)
1099 return -ENOMEM;
1100
1101 seg_len = rbd_get_segment(&rbd_dev->header,
Alex Elderca1e49a2012-07-10 20:30:09 -05001102 rbd_dev->header.object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001103 ofs, len,
1104 seg_name, &seg_ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001105
1106 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1107
1108 ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
1109 if (ret < 0)
1110 goto done;
1111
1112 /* we've taken care of segment sizes earlier when we
1113 cloned the bios. We should never have a segment
1114 truncated at this point */
1115 BUG_ON(seg_len < len);
1116
1117 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1118 seg_name, seg_ofs, seg_len,
1119 bio,
1120 NULL, 0,
1121 flags,
1122 ops,
1123 num_reply,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001124 coll, coll_index,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001125 rbd_req_cb, 0, NULL);
Sage Weil11f77002011-05-12 16:13:54 -07001126
1127 rbd_destroy_ops(ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001128done:
1129 kfree(seg_name);
1130 return ret;
1131}
1132
1133/*
1134 * Request async osd write
1135 */
1136static int rbd_req_write(struct request *rq,
1137 struct rbd_device *rbd_dev,
1138 struct ceph_snap_context *snapc,
1139 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001140 struct bio *bio,
1141 struct rbd_req_coll *coll,
1142 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001143{
1144 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1145 CEPH_OSD_OP_WRITE,
1146 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1147 2,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001148 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001149}
1150
1151/*
1152 * Request async osd read
1153 */
1154static int rbd_req_read(struct request *rq,
1155 struct rbd_device *rbd_dev,
1156 u64 snapid,
1157 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001158 struct bio *bio,
1159 struct rbd_req_coll *coll,
1160 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001161{
1162 return rbd_do_op(rq, rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001163 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001164 CEPH_OSD_OP_READ,
1165 CEPH_OSD_FLAG_READ,
1166 2,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001167 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001168}
1169
1170/*
1171 * Request sync osd read
1172 */
1173static int rbd_req_sync_read(struct rbd_device *dev,
1174 struct ceph_snap_context *snapc,
1175 u64 snapid,
1176 const char *obj,
1177 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001178 char *buf,
1179 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001180{
1181 return rbd_req_sync_op(dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001182 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001183 CEPH_OSD_OP_READ,
1184 CEPH_OSD_FLAG_READ,
1185 NULL,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001186 1, obj, ofs, len, buf, NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001187}
1188
1189/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001190 * Request sync osd watch
1191 */
1192static int rbd_req_sync_notify_ack(struct rbd_device *dev,
1193 u64 ver,
1194 u64 notify_id,
1195 const char *obj)
1196{
1197 struct ceph_osd_req_op *ops;
Sage Weil11f77002011-05-12 16:13:54 -07001198 int ret;
1199
1200 ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001201 if (ret < 0)
1202 return ret;
1203
1204 ops[0].watch.ver = cpu_to_le64(dev->header.obj_version);
1205 ops[0].watch.cookie = notify_id;
1206 ops[0].watch.flag = 0;
1207
1208 ret = rbd_do_request(NULL, dev, NULL, CEPH_NOSNAP,
1209 obj, 0, 0, NULL,
Alex Elderad4f2322012-07-03 16:01:19 -05001210 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001211 CEPH_OSD_FLAG_READ,
1212 ops,
1213 1,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001214 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001215 rbd_simple_req_cb, 0, NULL);
1216
1217 rbd_destroy_ops(ops);
1218 return ret;
1219}
1220
1221static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1222{
1223 struct rbd_device *dev = (struct rbd_device *)data;
Sage Weil13143d22011-05-12 16:08:30 -07001224 int rc;
1225
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001226 if (!dev)
1227 return;
1228
1229 dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
1230 notify_id, (int)opcode);
1231 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Josh Durgin263c6ca2011-12-05 10:43:42 -08001232 rc = __rbd_refresh_header(dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001233 mutex_unlock(&ctl_mutex);
Sage Weil13143d22011-05-12 16:08:30 -07001234 if (rc)
Alex Elderf0f8cef2012-01-29 13:57:44 -06001235 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
1236 " update snaps: %d\n", dev->major, rc);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001237
1238 rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name);
1239}
1240
1241/*
1242 * Request sync osd watch
1243 */
1244static int rbd_req_sync_watch(struct rbd_device *dev,
1245 const char *obj,
1246 u64 ver)
1247{
1248 struct ceph_osd_req_op *ops;
Alex Elder1dbb4392012-01-24 10:08:37 -06001249 struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001250
1251 int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
1252 if (ret < 0)
1253 return ret;
1254
1255 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
1256 (void *)dev, &dev->watch_event);
1257 if (ret < 0)
1258 goto fail;
1259
1260 ops[0].watch.ver = cpu_to_le64(ver);
1261 ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
1262 ops[0].watch.flag = 1;
1263
1264 ret = rbd_req_sync_op(dev, NULL,
1265 CEPH_NOSNAP,
1266 0,
1267 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1268 ops,
1269 1, obj, 0, 0, NULL,
1270 &dev->watch_request, NULL);
1271
1272 if (ret < 0)
1273 goto fail_event;
1274
1275 rbd_destroy_ops(ops);
1276 return 0;
1277
1278fail_event:
1279 ceph_osdc_cancel_event(dev->watch_event);
1280 dev->watch_event = NULL;
1281fail:
1282 rbd_destroy_ops(ops);
1283 return ret;
1284}
1285
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001286/*
1287 * Request sync osd unwatch
1288 */
1289static int rbd_req_sync_unwatch(struct rbd_device *dev,
1290 const char *obj)
1291{
1292 struct ceph_osd_req_op *ops;
1293
1294 int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
1295 if (ret < 0)
1296 return ret;
1297
1298 ops[0].watch.ver = 0;
1299 ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
1300 ops[0].watch.flag = 0;
1301
1302 ret = rbd_req_sync_op(dev, NULL,
1303 CEPH_NOSNAP,
1304 0,
1305 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1306 ops,
1307 1, obj, 0, 0, NULL, NULL, NULL);
1308
1309 rbd_destroy_ops(ops);
1310 ceph_osdc_cancel_event(dev->watch_event);
1311 dev->watch_event = NULL;
1312 return ret;
1313}
1314
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001315struct rbd_notify_info {
1316 struct rbd_device *dev;
1317};
1318
1319static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1320{
1321 struct rbd_device *dev = (struct rbd_device *)data;
1322 if (!dev)
1323 return;
1324
1325 dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
1326 notify_id, (int)opcode);
1327}
1328
1329/*
1330 * Request sync osd notify
1331 */
1332static int rbd_req_sync_notify(struct rbd_device *dev,
1333 const char *obj)
1334{
1335 struct ceph_osd_req_op *ops;
Alex Elder1dbb4392012-01-24 10:08:37 -06001336 struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001337 struct ceph_osd_event *event;
1338 struct rbd_notify_info info;
1339 int payload_len = sizeof(u32) + sizeof(u32);
1340 int ret;
1341
1342 ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
1343 if (ret < 0)
1344 return ret;
1345
1346 info.dev = dev;
1347
1348 ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
1349 (void *)&info, &event);
1350 if (ret < 0)
1351 goto fail;
1352
1353 ops[0].watch.ver = 1;
1354 ops[0].watch.flag = 1;
1355 ops[0].watch.cookie = event->cookie;
1356 ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
1357 ops[0].watch.timeout = 12;
1358
1359 ret = rbd_req_sync_op(dev, NULL,
1360 CEPH_NOSNAP,
1361 0,
1362 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1363 ops,
1364 1, obj, 0, 0, NULL, NULL, NULL);
1365 if (ret < 0)
1366 goto fail_event;
1367
1368 ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
1369 dout("ceph_osdc_wait_event returned %d\n", ret);
1370 rbd_destroy_ops(ops);
1371 return 0;
1372
1373fail_event:
1374 ceph_osdc_cancel_event(event);
1375fail:
1376 rbd_destroy_ops(ops);
1377 return ret;
1378}
1379
1380/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001381 * Request sync osd read
1382 */
1383static int rbd_req_sync_exec(struct rbd_device *dev,
1384 const char *obj,
1385 const char *cls,
1386 const char *method,
1387 const char *data,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001388 int len,
1389 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001390{
1391 struct ceph_osd_req_op *ops;
1392 int cls_len = strlen(cls);
1393 int method_len = strlen(method);
1394 int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
1395 cls_len + method_len + len);
1396 if (ret < 0)
1397 return ret;
1398
1399 ops[0].cls.class_name = cls;
1400 ops[0].cls.class_len = (__u8)cls_len;
1401 ops[0].cls.method_name = method;
1402 ops[0].cls.method_len = (__u8)method_len;
1403 ops[0].cls.argc = 0;
1404 ops[0].cls.indata = data;
1405 ops[0].cls.indata_len = len;
1406
1407 ret = rbd_req_sync_op(dev, NULL,
1408 CEPH_NOSNAP,
1409 0,
1410 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1411 ops,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001412 1, obj, 0, 0, NULL, NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001413
1414 rbd_destroy_ops(ops);
1415
1416 dout("cls_exec returned %d\n", ret);
1417 return ret;
1418}
1419
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001420static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1421{
1422 struct rbd_req_coll *coll =
1423 kzalloc(sizeof(struct rbd_req_coll) +
1424 sizeof(struct rbd_req_status) * num_reqs,
1425 GFP_ATOMIC);
1426
1427 if (!coll)
1428 return NULL;
1429 coll->total = num_reqs;
1430 kref_init(&coll->kref);
1431 return coll;
1432}
1433
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001434/*
1435 * block device queue callback
1436 */
1437static void rbd_rq_fn(struct request_queue *q)
1438{
1439 struct rbd_device *rbd_dev = q->queuedata;
1440 struct request *rq;
1441 struct bio_pair *bp = NULL;
1442
Alex Elder00f1f362012-02-07 12:03:36 -06001443 while ((rq = blk_fetch_request(q))) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001444 struct bio *bio;
1445 struct bio *rq_bio, *next_bio = NULL;
1446 bool do_write;
1447 int size, op_size = 0;
1448 u64 ofs;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001449 int num_segs, cur_seg = 0;
1450 struct rbd_req_coll *coll;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001451
1452 /* peek at request from block layer */
1453 if (!rq)
1454 break;
1455
1456 dout("fetched request\n");
1457
1458 /* filter out block requests we don't understand */
1459 if ((rq->cmd_type != REQ_TYPE_FS)) {
1460 __blk_end_request_all(rq, 0);
Alex Elder00f1f362012-02-07 12:03:36 -06001461 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001462 }
1463
1464 /* deduce our operation (read, write) */
1465 do_write = (rq_data_dir(rq) == WRITE);
1466
1467 size = blk_rq_bytes(rq);
Alex Elder593a9e72012-02-07 12:03:37 -06001468 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001469 rq_bio = rq->bio;
1470 if (do_write && rbd_dev->read_only) {
1471 __blk_end_request_all(rq, -EROFS);
Alex Elder00f1f362012-02-07 12:03:36 -06001472 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001473 }
1474
1475 spin_unlock_irq(q->queue_lock);
1476
1477 dout("%s 0x%x bytes at 0x%llx\n",
1478 do_write ? "write" : "read",
Alex Elder593a9e72012-02-07 12:03:37 -06001479 size, blk_rq_pos(rq) * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001480
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001481 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
1482 coll = rbd_alloc_coll(num_segs);
1483 if (!coll) {
1484 spin_lock_irq(q->queue_lock);
1485 __blk_end_request_all(rq, -ENOMEM);
Alex Elder00f1f362012-02-07 12:03:36 -06001486 continue;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001487 }
1488
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001489 do {
1490 /* a bio clone to be passed down to OSD req */
1491 dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
1492 op_size = rbd_get_segment(&rbd_dev->header,
Alex Elderca1e49a2012-07-10 20:30:09 -05001493 rbd_dev->header.object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001494 ofs, size,
1495 NULL, NULL);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001496 kref_get(&coll->kref);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001497 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1498 op_size, GFP_ATOMIC);
1499 if (!bio) {
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001500 rbd_coll_end_req_index(rq, coll, cur_seg,
1501 -ENOMEM, op_size);
1502 goto next_seg;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001503 }
1504
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001505
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001506 /* init OSD command: write or read */
1507 if (do_write)
1508 rbd_req_write(rq, rbd_dev,
1509 rbd_dev->header.snapc,
1510 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001511 op_size, bio,
1512 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001513 else
1514 rbd_req_read(rq, rbd_dev,
Josh Durgin77dfe992011-11-21 13:04:42 -08001515 rbd_dev->snap_id,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001516 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001517 op_size, bio,
1518 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001519
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001520next_seg:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001521 size -= op_size;
1522 ofs += op_size;
1523
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001524 cur_seg++;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001525 rq_bio = next_bio;
1526 } while (size > 0);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001527 kref_put(&coll->kref, rbd_coll_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001528
1529 if (bp)
1530 bio_pair_release(bp);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001531 spin_lock_irq(q->queue_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001532 }
1533}
1534
1535/*
1536 * a queue callback. Makes sure that we don't create a bio that spans across
1537 * multiple osd objects. One exception would be with a single page bios,
1538 * which we handle later at bio_chain_clone
1539 */
1540static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1541 struct bio_vec *bvec)
1542{
1543 struct rbd_device *rbd_dev = q->queuedata;
Alex Elder593a9e72012-02-07 12:03:37 -06001544 unsigned int chunk_sectors;
1545 sector_t sector;
1546 unsigned int bio_sectors;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001547 int max;
1548
Alex Elder593a9e72012-02-07 12:03:37 -06001549 chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1550 sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1551 bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1552
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001553 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
Alex Elder593a9e72012-02-07 12:03:37 -06001554 + bio_sectors)) << SECTOR_SHIFT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001555 if (max < 0)
1556 max = 0; /* bio_add cannot handle a negative return */
1557 if (max <= bvec->bv_len && bio_sectors == 0)
1558 return bvec->bv_len;
1559 return max;
1560}
1561
1562static void rbd_free_disk(struct rbd_device *rbd_dev)
1563{
1564 struct gendisk *disk = rbd_dev->disk;
1565
1566 if (!disk)
1567 return;
1568
1569 rbd_header_free(&rbd_dev->header);
1570
1571 if (disk->flags & GENHD_FL_UP)
1572 del_gendisk(disk);
1573 if (disk->queue)
1574 blk_cleanup_queue(disk->queue);
1575 put_disk(disk);
1576}
1577
1578/*
1579 * reload the ondisk the header
1580 */
1581static int rbd_read_header(struct rbd_device *rbd_dev,
1582 struct rbd_image_header *header)
1583{
1584 ssize_t rc;
1585 struct rbd_image_header_ondisk *dh;
Xi Wang50f7c4c2012-04-20 15:49:44 -05001586 u32 snap_count = 0;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001587 u64 ver;
Alex Elder00f1f362012-02-07 12:03:36 -06001588 size_t len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001589
Alex Elder00f1f362012-02-07 12:03:36 -06001590 /*
1591 * First reads the fixed-size header to determine the number
1592 * of snapshots, then re-reads it, along with all snapshot
1593 * records as well as their stored names.
1594 */
1595 len = sizeof (*dh);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001596 while (1) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001597 dh = kmalloc(len, GFP_KERNEL);
1598 if (!dh)
1599 return -ENOMEM;
1600
1601 rc = rbd_req_sync_read(rbd_dev,
1602 NULL, CEPH_NOSNAP,
1603 rbd_dev->obj_md_name,
1604 0, len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001605 (char *)dh, &ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001606 if (rc < 0)
1607 goto out_dh;
1608
1609 rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
Josh Durgin81e759f2011-11-15 14:49:53 -08001610 if (rc < 0) {
Alex Elder00f1f362012-02-07 12:03:36 -06001611 if (rc == -ENXIO)
Josh Durgin81e759f2011-11-15 14:49:53 -08001612 pr_warning("unrecognized header format"
1613 " for image %s", rbd_dev->obj);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001614 goto out_dh;
Josh Durgin81e759f2011-11-15 14:49:53 -08001615 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001616
Alex Elder00f1f362012-02-07 12:03:36 -06001617 if (snap_count == header->total_snaps)
1618 break;
1619
1620 snap_count = header->total_snaps;
1621 len = sizeof (*dh) +
1622 snap_count * sizeof(struct rbd_image_snap_ondisk) +
1623 header->snap_names_len;
1624
1625 rbd_header_free(header);
1626 kfree(dh);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001627 }
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001628 header->obj_version = ver;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001629
1630out_dh:
1631 kfree(dh);
1632 return rc;
1633}
1634
1635/*
1636 * create a snapshot
1637 */
1638static int rbd_header_add_snap(struct rbd_device *dev,
1639 const char *snap_name,
1640 gfp_t gfp_flags)
1641{
1642 int name_len = strlen(snap_name);
1643 u64 new_snapid;
1644 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001645 void *data, *p, *e;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001646 u64 ver;
Alex Elder1dbb4392012-01-24 10:08:37 -06001647 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001648
1649 /* we should create a snapshot only if we're pointing at the head */
Josh Durgin77dfe992011-11-21 13:04:42 -08001650 if (dev->snap_id != CEPH_NOSNAP)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001651 return -EINVAL;
1652
Alex Elder1dbb4392012-01-24 10:08:37 -06001653 monc = &dev->rbd_client->client->monc;
Alex Elder9bb2f332012-07-12 10:46:35 -05001654 ret = ceph_monc_create_snapid(monc, dev->pool_id, &new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001655 dout("created snapid=%lld\n", new_snapid);
1656 if (ret < 0)
1657 return ret;
1658
1659 data = kmalloc(name_len + 16, gfp_flags);
1660 if (!data)
1661 return -ENOMEM;
1662
Sage Weil916d4d62011-05-12 16:10:50 -07001663 p = data;
1664 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001665
Sage Weil916d4d62011-05-12 16:10:50 -07001666 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1667 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001668
1669 ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
Sage Weil916d4d62011-05-12 16:10:50 -07001670 data, p - data, &ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001671
Sage Weil916d4d62011-05-12 16:10:50 -07001672 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001673
1674 if (ret < 0)
1675 return ret;
1676
Josh Durgin403f24d2011-12-05 10:47:13 -08001677 down_write(&dev->header_rwsem);
1678 dev->header.snapc->seq = new_snapid;
1679 up_write(&dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001680
1681 return 0;
1682bad:
1683 return -ERANGE;
1684}
1685
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001686static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1687{
1688 struct rbd_snap *snap;
1689
1690 while (!list_empty(&rbd_dev->snaps)) {
1691 snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node);
1692 __rbd_remove_snap_dev(rbd_dev, snap);
1693 }
1694}
1695
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001696/*
1697 * only read the first part of the ondisk header, without the snaps info
1698 */
Josh Durgin263c6ca2011-12-05 10:43:42 -08001699static int __rbd_refresh_header(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001700{
1701 int ret;
1702 struct rbd_image_header h;
1703 u64 snap_seq;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001704 int follow_seq = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001705
1706 ret = rbd_read_header(rbd_dev, &h);
1707 if (ret < 0)
1708 return ret;
1709
Sage Weil9db4b3e2011-04-19 22:49:06 -07001710 /* resized? */
Alex Elder593a9e72012-02-07 12:03:37 -06001711 set_capacity(rbd_dev->disk, h.image_size / SECTOR_SIZE);
Sage Weil9db4b3e2011-04-19 22:49:06 -07001712
Josh Durginc6666012011-11-21 17:11:12 -08001713 down_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001714
1715 snap_seq = rbd_dev->header.snapc->seq;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001716 if (rbd_dev->header.total_snaps &&
1717 rbd_dev->header.snapc->snaps[0] == snap_seq)
1718 /* pointing at the head, will need to follow that
1719 if head moves */
1720 follow_seq = 1;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001721
Alex Elder849b4262012-07-09 21:04:24 -05001722 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001723 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05001724 kfree(rbd_dev->header.snap_names);
1725 kfree(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001726
1727 rbd_dev->header.total_snaps = h.total_snaps;
1728 rbd_dev->header.snapc = h.snapc;
1729 rbd_dev->header.snap_names = h.snap_names;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001730 rbd_dev->header.snap_names_len = h.snap_names_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001731 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05001732 /* Free the extra copy of the object prefix */
1733 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1734 kfree(h.object_prefix);
1735
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001736 if (follow_seq)
1737 rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0];
1738 else
1739 rbd_dev->header.snapc->seq = snap_seq;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001740
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001741 ret = __rbd_init_snaps_header(rbd_dev);
1742
Josh Durginc6666012011-11-21 17:11:12 -08001743 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001744
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001745 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001746}
1747
1748static int rbd_init_disk(struct rbd_device *rbd_dev)
1749{
1750 struct gendisk *disk;
1751 struct request_queue *q;
1752 int rc;
Alex Elder593a9e72012-02-07 12:03:37 -06001753 u64 segment_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001754 u64 total_size = 0;
1755
1756 /* contact OSD, request size info about the object being mapped */
1757 rc = rbd_read_header(rbd_dev, &rbd_dev->header);
1758 if (rc)
1759 return rc;
1760
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001761 /* no need to lock here, as rbd_dev is not registered yet */
1762 rc = __rbd_init_snaps_header(rbd_dev);
1763 if (rc)
1764 return rc;
1765
Josh Durgincc9d7342011-11-21 18:19:13 -08001766 rc = rbd_header_set_snap(rbd_dev, &total_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001767 if (rc)
1768 return rc;
1769
1770 /* create gendisk info */
1771 rc = -ENOMEM;
1772 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1773 if (!disk)
1774 goto out;
1775
Alex Elderf0f8cef2012-01-29 13:57:44 -06001776 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Sage Weilaedfec52011-05-12 20:57:03 -07001777 rbd_dev->id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001778 disk->major = rbd_dev->major;
1779 disk->first_minor = 0;
1780 disk->fops = &rbd_bd_ops;
1781 disk->private_data = rbd_dev;
1782
1783 /* init rq */
1784 rc = -ENOMEM;
1785 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1786 if (!q)
1787 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07001788
Alex Elder593a9e72012-02-07 12:03:37 -06001789 /* We use the default size, but let's be explicit about it. */
1790 blk_queue_physical_block_size(q, SECTOR_SIZE);
1791
Josh Durgin029bcbd2011-07-22 11:35:23 -07001792 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06001793 segment_size = rbd_obj_bytes(&rbd_dev->header);
1794 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1795 blk_queue_max_segment_size(q, segment_size);
1796 blk_queue_io_min(q, segment_size);
1797 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07001798
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001799 blk_queue_merge_bvec(q, rbd_merge_bvec);
1800 disk->queue = q;
1801
1802 q->queuedata = rbd_dev;
1803
1804 rbd_dev->disk = disk;
1805 rbd_dev->q = q;
1806
1807 /* finally, announce the disk to the world */
Alex Elder593a9e72012-02-07 12:03:37 -06001808 set_capacity(disk, total_size / SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001809 add_disk(disk);
1810
1811 pr_info("%s: added with size 0x%llx\n",
1812 disk->disk_name, (unsigned long long)total_size);
1813 return 0;
1814
1815out_disk:
1816 put_disk(disk);
1817out:
1818 return rc;
1819}
1820
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001821/*
1822 sysfs
1823*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001824
Alex Elder593a9e72012-02-07 12:03:37 -06001825static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1826{
1827 return container_of(dev, struct rbd_device, dev);
1828}
1829
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001830static ssize_t rbd_size_show(struct device *dev,
1831 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001832{
Alex Elder593a9e72012-02-07 12:03:37 -06001833 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001834
1835 return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001836}
1837
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001838static ssize_t rbd_major_show(struct device *dev,
1839 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001840{
Alex Elder593a9e72012-02-07 12:03:37 -06001841 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001842
1843 return sprintf(buf, "%d\n", rbd_dev->major);
1844}
1845
1846static ssize_t rbd_client_id_show(struct device *dev,
1847 struct device_attribute *attr, char *buf)
1848{
Alex Elder593a9e72012-02-07 12:03:37 -06001849 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001850
Alex Elder1dbb4392012-01-24 10:08:37 -06001851 return sprintf(buf, "client%lld\n",
1852 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001853}
1854
1855static ssize_t rbd_pool_show(struct device *dev,
1856 struct device_attribute *attr, char *buf)
1857{
Alex Elder593a9e72012-02-07 12:03:37 -06001858 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001859
1860 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1861}
1862
Alex Elder9bb2f332012-07-12 10:46:35 -05001863static ssize_t rbd_pool_id_show(struct device *dev,
1864 struct device_attribute *attr, char *buf)
1865{
1866 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1867
1868 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1869}
1870
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001871static ssize_t rbd_name_show(struct device *dev,
1872 struct device_attribute *attr, char *buf)
1873{
Alex Elder593a9e72012-02-07 12:03:37 -06001874 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001875
1876 return sprintf(buf, "%s\n", rbd_dev->obj);
1877}
1878
1879static ssize_t rbd_snap_show(struct device *dev,
1880 struct device_attribute *attr,
1881 char *buf)
1882{
Alex Elder593a9e72012-02-07 12:03:37 -06001883 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001884
1885 return sprintf(buf, "%s\n", rbd_dev->snap_name);
1886}
1887
1888static ssize_t rbd_image_refresh(struct device *dev,
1889 struct device_attribute *attr,
1890 const char *buf,
1891 size_t size)
1892{
Alex Elder593a9e72012-02-07 12:03:37 -06001893 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001894 int rc;
1895 int ret = size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001896
1897 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1898
Josh Durgin263c6ca2011-12-05 10:43:42 -08001899 rc = __rbd_refresh_header(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001900 if (rc < 0)
1901 ret = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001902
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001903 mutex_unlock(&ctl_mutex);
1904 return ret;
1905}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001906
/*
 * Per-device sysfs attributes.  The S_IRUGO entries only have a show
 * handler; the S_IWUSR entries ("refresh", "create_snap") only have a
 * store handler.
 */
static DEVICE_ATTR(size,	 S_IRUGO, rbd_size_show,      NULL);
static DEVICE_ATTR(major,	 S_IRUGO, rbd_major_show,     NULL);
static DEVICE_ATTR(client_id,	 S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool,	 S_IRUGO, rbd_pool_show,      NULL);
static DEVICE_ATTR(pool_id,	 S_IRUGO, rbd_pool_id_show,   NULL);
static DEVICE_ATTR(name,	 S_IRUGO, rbd_name_show,      NULL);
static DEVICE_ATTR(refresh,	 S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show,      NULL);
static DEVICE_ATTR(create_snap,	 S_IWUSR, NULL, rbd_snap_add);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001916
1917static struct attribute *rbd_attrs[] = {
1918 &dev_attr_size.attr,
1919 &dev_attr_major.attr,
1920 &dev_attr_client_id.attr,
1921 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05001922 &dev_attr_pool_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001923 &dev_attr_name.attr,
1924 &dev_attr_current_snap.attr,
1925 &dev_attr_refresh.attr,
1926 &dev_attr_create_snap.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001927 NULL
1928};
1929
1930static struct attribute_group rbd_attr_group = {
1931 .attrs = rbd_attrs,
1932};
1933
1934static const struct attribute_group *rbd_attr_groups[] = {
1935 &rbd_attr_group,
1936 NULL
1937};
1938
/*
 * Release callback referenced by rbd_device_type; it does nothing.
 *
 * NOTE(review): rbd_bus_add_dev() also assigns dev->release =
 * rbd_dev_release on each device, so presumably that callback does the
 * real teardown and this one is never reached -- confirm.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
	/* intentionally empty */
}
1942
1943static struct device_type rbd_device_type = {
1944 .name = "rbd",
1945 .groups = rbd_attr_groups,
1946 .release = rbd_sysfs_dev_release,
1947};
1948
1949
1950/*
1951 sysfs - snapshots
1952*/
1953
1954static ssize_t rbd_snap_size_show(struct device *dev,
1955 struct device_attribute *attr,
1956 char *buf)
1957{
1958 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1959
Josh Durgin35915382011-12-05 18:25:13 -08001960 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001961}
1962
1963static ssize_t rbd_snap_id_show(struct device *dev,
1964 struct device_attribute *attr,
1965 char *buf)
1966{
1967 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1968
Josh Durgin35915382011-12-05 18:25:13 -08001969 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001970}
1971
/* Read-only per-snapshot sysfs attributes. */
static DEVICE_ATTR(snap_size,	S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id,	S_IRUGO, rbd_snap_id_show,   NULL);
1974
1975static struct attribute *rbd_snap_attrs[] = {
1976 &dev_attr_snap_size.attr,
1977 &dev_attr_snap_id.attr,
1978 NULL,
1979};
1980
1981static struct attribute_group rbd_snap_attr_group = {
1982 .attrs = rbd_snap_attrs,
1983};
1984
1985static void rbd_snap_dev_release(struct device *dev)
1986{
1987 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1988 kfree(snap->name);
1989 kfree(snap);
1990}
1991
1992static const struct attribute_group *rbd_snap_attr_groups[] = {
1993 &rbd_snap_attr_group,
1994 NULL
1995};
1996
1997static struct device_type rbd_snap_device_type = {
1998 .groups = rbd_snap_attr_groups,
1999 .release = rbd_snap_dev_release,
2000};
2001
2002static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
2003 struct rbd_snap *snap)
2004{
2005 list_del(&snap->node);
2006 device_unregister(&snap->dev);
2007}
2008
2009static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
2010 struct rbd_snap *snap,
2011 struct device *parent)
2012{
2013 struct device *dev = &snap->dev;
2014 int ret;
2015
2016 dev->type = &rbd_snap_device_type;
2017 dev->parent = parent;
2018 dev->release = rbd_snap_dev_release;
2019 dev_set_name(dev, "snap_%s", snap->name);
2020 ret = device_register(dev);
2021
2022 return ret;
2023}
2024
2025static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
2026 int i, const char *name,
2027 struct rbd_snap **snapp)
2028{
2029 int ret;
2030 struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
2031 if (!snap)
2032 return -ENOMEM;
2033 snap->name = kstrdup(name, GFP_KERNEL);
2034 snap->size = rbd_dev->header.snap_sizes[i];
2035 snap->id = rbd_dev->header.snapc->snaps[i];
2036 if (device_is_registered(&rbd_dev->dev)) {
2037 ret = rbd_register_snap_dev(rbd_dev, snap,
2038 &rbd_dev->dev);
2039 if (ret < 0)
2040 goto err;
2041 }
2042 *snapp = snap;
2043 return 0;
2044err:
2045 kfree(snap->name);
2046 kfree(snap);
2047 return ret;
2048}
2049
/*
 * Step backward by one entry in a list of '\0'-terminated strings laid
 * out back to back.
 *
 * @name points at the start of an entry (or just past the last one)
 * and @start at the beginning of the list.  Returns a pointer to the
 * start of the preceding entry, or NULL if @name is less than two
 * bytes past @start (no preceding entry).
 */
const char *rbd_prev_snap_name(const char *name, const char *start)
{
	const char *cursor;

	if (name < start + 2)
		return NULL;

	/* Skip the terminator just before @name; begin on the char before it. */
	for (cursor = name - 2; *cursor != '\0'; cursor--) {
		if (cursor == start)
			return start;	/* hit the first entry */
	}

	/* cursor rests on the previous entry's predecessor terminator */
	return cursor + 1;
}
2066
2067/*
2068 * compare the old list of snapshots that we have to what's in the header
2069 * and update it accordingly. Note that the header holds the snapshots
2070 * in a reverse order (from newest to oldest) and we need to go from
2071 * older to new so that we don't get a duplicate snap name when
2072 * doing the process (e.g., removed snapshot and recreated a new
2073 * one with the same name.
2074 */
2075static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
2076{
2077 const char *name, *first_name;
2078 int i = rbd_dev->header.total_snaps;
2079 struct rbd_snap *snap, *old_snap = NULL;
2080 int ret;
2081 struct list_head *p, *n;
2082
2083 first_name = rbd_dev->header.snap_names;
2084 name = first_name + rbd_dev->header.snap_names_len;
2085
2086 list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
2087 u64 cur_id;
2088
2089 old_snap = list_entry(p, struct rbd_snap, node);
2090
2091 if (i)
2092 cur_id = rbd_dev->header.snapc->snaps[i - 1];
2093
2094 if (!i || old_snap->id < cur_id) {
2095 /* old_snap->id was skipped, thus was removed */
2096 __rbd_remove_snap_dev(rbd_dev, old_snap);
2097 continue;
2098 }
2099 if (old_snap->id == cur_id) {
2100 /* we have this snapshot already */
2101 i--;
2102 name = rbd_prev_snap_name(name, first_name);
2103 continue;
2104 }
2105 for (; i > 0;
2106 i--, name = rbd_prev_snap_name(name, first_name)) {
2107 if (!name) {
2108 WARN_ON(1);
2109 return -EINVAL;
2110 }
2111 cur_id = rbd_dev->header.snapc->snaps[i];
2112 /* snapshot removal? handle it above */
2113 if (cur_id >= old_snap->id)
2114 break;
2115 /* a new snapshot */
2116 ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
2117 if (ret < 0)
2118 return ret;
2119
2120 /* note that we add it backward so using n and not p */
2121 list_add(&snap->node, n);
2122 p = &snap->node;
2123 }
2124 }
2125 /* we're done going over the old snap list, just add what's left */
2126 for (; i > 0; i--) {
2127 name = rbd_prev_snap_name(name, first_name);
2128 if (!name) {
2129 WARN_ON(1);
2130 return -EINVAL;
2131 }
2132 ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
2133 if (ret < 0)
2134 return ret;
2135 list_add(&snap->node, &rbd_dev->snaps);
2136 }
2137
2138 return 0;
2139}
2140
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002141static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2142{
Alex Elderf0f8cef2012-01-29 13:57:44 -06002143 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002144 struct device *dev;
2145 struct rbd_snap *snap;
2146
2147 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2148 dev = &rbd_dev->dev;
2149
2150 dev->bus = &rbd_bus_type;
2151 dev->type = &rbd_device_type;
2152 dev->parent = &rbd_root_dev;
2153 dev->release = rbd_dev_release;
2154 dev_set_name(dev, "%d", rbd_dev->id);
2155 ret = device_register(dev);
2156 if (ret < 0)
Alex Elderf0f8cef2012-01-29 13:57:44 -06002157 goto out;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002158
2159 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2160 ret = rbd_register_snap_dev(rbd_dev, snap,
2161 &rbd_dev->dev);
2162 if (ret < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002163 break;
2164 }
Alex Elderf0f8cef2012-01-29 13:57:44 -06002165out:
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002166 mutex_unlock(&ctl_mutex);
2167 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002168}
2169
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002170static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2171{
2172 device_unregister(&rbd_dev->dev);
2173}
2174
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002175static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2176{
2177 int ret, rc;
2178
2179 do {
2180 ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name,
2181 rbd_dev->header.obj_version);
2182 if (ret == -ERANGE) {
2183 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Josh Durgin263c6ca2011-12-05 10:43:42 -08002184 rc = __rbd_refresh_header(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002185 mutex_unlock(&ctl_mutex);
2186 if (rc < 0)
2187 return rc;
2188 }
2189 } while (ret == -ERANGE);
2190
2191 return ret;
2192}
2193
Alex Elder1ddbe942012-01-29 13:57:44 -06002194static atomic64_t rbd_id_max = ATOMIC64_INIT(0);
2195
2196/*
Alex Elder499afd52012-02-02 08:13:29 -06002197 * Get a unique rbd identifier for the given new rbd_dev, and add
2198 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06002199 */
Alex Elder499afd52012-02-02 08:13:29 -06002200static void rbd_id_get(struct rbd_device *rbd_dev)
Alex Elderb7f23c32012-01-29 13:57:43 -06002201{
Alex Elder499afd52012-02-02 08:13:29 -06002202 rbd_dev->id = atomic64_inc_return(&rbd_id_max);
2203
2204 spin_lock(&rbd_dev_list_lock);
2205 list_add_tail(&rbd_dev->node, &rbd_dev_list);
2206 spin_unlock(&rbd_dev_list_lock);
Alex Elder1ddbe942012-01-29 13:57:44 -06002207}
Alex Elderb7f23c32012-01-29 13:57:43 -06002208
Alex Elder1ddbe942012-01-29 13:57:44 -06002209/*
Alex Elder499afd52012-02-02 08:13:29 -06002210 * Remove an rbd_dev from the global list, and record that its
2211 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002212 */
Alex Elder499afd52012-02-02 08:13:29 -06002213static void rbd_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002214{
Alex Elderd184f6b2012-01-29 13:57:44 -06002215 struct list_head *tmp;
2216 int rbd_id = rbd_dev->id;
2217 int max_id;
2218
2219 BUG_ON(rbd_id < 1);
Alex Elder499afd52012-02-02 08:13:29 -06002220
2221 spin_lock(&rbd_dev_list_lock);
2222 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002223
2224 /*
2225 * If the id being "put" is not the current maximum, there
2226 * is nothing special we need to do.
2227 */
2228 if (rbd_id != atomic64_read(&rbd_id_max)) {
2229 spin_unlock(&rbd_dev_list_lock);
2230 return;
2231 }
2232
2233 /*
2234 * We need to update the current maximum id. Search the
2235 * list to find out what it is. We're more likely to find
2236 * the maximum at the end, so search the list backward.
2237 */
2238 max_id = 0;
2239 list_for_each_prev(tmp, &rbd_dev_list) {
2240 struct rbd_device *rbd_dev;
2241
2242 rbd_dev = list_entry(tmp, struct rbd_device, node);
2243 if (rbd_id > max_id)
2244 max_id = rbd_id;
2245 }
Alex Elder499afd52012-02-02 08:13:29 -06002246 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002247
Alex Elder1ddbe942012-01-29 13:57:44 -06002248 /*
Alex Elderd184f6b2012-01-29 13:57:44 -06002249 * The max id could have been updated by rbd_id_get(), in
2250 * which case it now accurately reflects the new maximum.
2251 * Be careful not to overwrite the maximum value in that
2252 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002253 */
Alex Elderd184f6b2012-01-29 13:57:44 -06002254 atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
Alex Elderb7f23c32012-01-29 13:57:43 -06002255}
2256
/*
 * Advance *buf past any leading white space so that it points at the
 * first non-space character (or at the terminating '\0').  Returns
 * the length of the token -- the run of non-white-space characters --
 * that starts there; 0 means no token was found.
 *
 * *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	const char *whitespace = " \f\n\r\t\v";
	const char *token_start;

	token_start = *buf + strspn(*buf, whitespace);
	*buf = token_start;

	return strcspn(token_start, whitespace);
}
2275
/*
 * Locate the next token in *buf and, if it fits, copy it into the
 * caller's buffer with a terminating '\0'.  *buf must be terminated
 * with '\0' on entry.
 *
 * Returns the token's length (excluding the '\0').  A return of 0
 * means no token was found; a return >= token_size means the token
 * did not fit and nothing was copied.
 *
 * In every case *buf is left pointing just past the token, even when
 * the token was too long to copy.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t token_len = next_token(buf);
	const char *src = *buf;

	*buf = src + token_len;
	if (token_len >= token_size)
		return token_len;	/* does not fit; leave token alone */

	memcpy(token, src, token_len);
	token[token_len] = '\0';

	return token_len;
}
2305
2306/*
Alex Elderea3352f2012-07-09 21:04:23 -05002307 * Finds the next token in *buf, dynamically allocates a buffer big
2308 * enough to hold a copy of it, and copies the token into the new
2309 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2310 * that a duplicate buffer is created even for a zero-length token.
2311 *
2312 * Returns a pointer to the newly-allocated duplicate, or a null
2313 * pointer if memory for the duplicate was not available. If
2314 * the lenp argument is a non-null pointer, the length of the token
2315 * (not including the '\0') is returned in *lenp.
2316 *
2317 * If successful, the *buf pointer will be updated to point beyond
2318 * the end of the found token.
2319 *
2320 * Note: uses GFP_KERNEL for allocation.
2321 */
2322static inline char *dup_token(const char **buf, size_t *lenp)
2323{
2324 char *dup;
2325 size_t len;
2326
2327 len = next_token(buf);
2328 dup = kmalloc(len + 1, GFP_KERNEL);
2329 if (!dup)
2330 return NULL;
2331
2332 memcpy(dup, *buf, len);
2333 *(dup + len) = '\0';
2334 *buf += len;
2335
2336 if (lenp)
2337 *lenp = len;
2338
2339 return dup;
2340}
2341
2342/*
Alex Eldera725f65e2012-02-02 08:13:30 -06002343 * This fills in the pool_name, obj, obj_len, snap_name, obj_len,
2344 * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
2345 * on the list of monitor addresses and other options provided via
2346 * /sys/bus/rbd/add.
Alex Elderd22f76e2012-07-12 10:46:35 -05002347 *
2348 * Note: rbd_dev is assumed to have been initially zero-filled.
Alex Eldera725f65e2012-02-02 08:13:30 -06002349 */
2350static int rbd_add_parse_args(struct rbd_device *rbd_dev,
2351 const char *buf,
Alex Elder7ef32142012-02-02 08:13:30 -06002352 const char **mon_addrs,
Alex Elder5214ecc2012-02-02 08:13:30 -06002353 size_t *mon_addrs_size,
Alex Eldere28fff262012-02-02 08:13:30 -06002354 char *options,
2355 size_t options_size)
Alex Eldera725f65e2012-02-02 08:13:30 -06002356{
Alex Elderd22f76e2012-07-12 10:46:35 -05002357 size_t len;
2358 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06002359
2360 /* The first four tokens are required */
2361
Alex Elder7ef32142012-02-02 08:13:30 -06002362 len = next_token(&buf);
2363 if (!len)
Alex Eldera725f65e2012-02-02 08:13:30 -06002364 return -EINVAL;
Alex Elder5214ecc2012-02-02 08:13:30 -06002365 *mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06002366 *mon_addrs = buf;
2367
2368 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06002369
Alex Eldere28fff262012-02-02 08:13:30 -06002370 len = copy_token(&buf, options, options_size);
2371 if (!len || len >= options_size)
2372 return -EINVAL;
Alex Eldera725f65e2012-02-02 08:13:30 -06002373
Alex Elderbf3e5ae2012-07-09 21:04:23 -05002374 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05002375 rbd_dev->pool_name = dup_token(&buf, NULL);
2376 if (!rbd_dev->pool_name)
Alex Elderd22f76e2012-07-12 10:46:35 -05002377 goto out_err;
Alex Eldere28fff262012-02-02 08:13:30 -06002378
Alex Elderbf3e5ae2012-07-09 21:04:23 -05002379 rbd_dev->obj = dup_token(&buf, &rbd_dev->obj_len);
2380 if (!rbd_dev->obj)
2381 goto out_err;
Alex Eldere28fff262012-02-02 08:13:30 -06002382
Alex Eldercb8627c2012-07-09 21:04:23 -05002383 /* Create the name of the header object */
2384
Alex Elderbf3e5ae2012-07-09 21:04:23 -05002385 rbd_dev->obj_md_name = kmalloc(rbd_dev->obj_len
2386 + sizeof (RBD_SUFFIX),
2387 GFP_KERNEL);
2388 if (!rbd_dev->obj_md_name)
Alex Eldercb8627c2012-07-09 21:04:23 -05002389 goto out_err;
Alex Elder81a89792012-02-02 08:13:30 -06002390 sprintf(rbd_dev->obj_md_name, "%s%s", rbd_dev->obj, RBD_SUFFIX);
Alex Eldera725f65e2012-02-02 08:13:30 -06002391
Alex Eldere28fff262012-02-02 08:13:30 -06002392 /*
2393 * The snapshot name is optional, but it's an error if it's
2394 * too long. If no snapshot is supplied, fill in the default.
2395 */
2396 len = copy_token(&buf, rbd_dev->snap_name, sizeof (rbd_dev->snap_name));
2397 if (!len)
2398 memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
2399 sizeof (RBD_SNAP_HEAD_NAME));
Alex Elder849b4262012-07-09 21:04:24 -05002400 else if (len >= sizeof (rbd_dev->snap_name)) {
2401 ret = -EINVAL;
Alex Elderd22f76e2012-07-12 10:46:35 -05002402 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05002403 }
Alex Eldere28fff262012-02-02 08:13:30 -06002404
Alex Eldera725f65e2012-02-02 08:13:30 -06002405 return 0;
Alex Elderd22f76e2012-07-12 10:46:35 -05002406
2407out_err:
Alex Eldercb8627c2012-07-09 21:04:23 -05002408 kfree(rbd_dev->obj_md_name);
Alex Elderbf3e5ae2012-07-09 21:04:23 -05002409 kfree(rbd_dev->obj);
Alex Elderd22f76e2012-07-12 10:46:35 -05002410 kfree(rbd_dev->pool_name);
2411 rbd_dev->pool_name = NULL;
2412
2413 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06002414}
2415
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002416static ssize_t rbd_add(struct bus_type *bus,
2417 const char *buf,
2418 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002419{
Alex Eldercb8627c2012-07-09 21:04:23 -05002420 char *options;
2421 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06002422 const char *mon_addrs = NULL;
2423 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06002424 struct ceph_osd_client *osdc;
2425 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002426
2427 if (!try_module_get(THIS_MODULE))
2428 return -ENODEV;
2429
Alex Elder27cc2592012-02-02 08:13:30 -06002430 options = kmalloc(count, GFP_KERNEL);
2431 if (!options)
2432 goto err_nomem;
Alex Eldercb8627c2012-07-09 21:04:23 -05002433 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2434 if (!rbd_dev)
2435 goto err_nomem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002436
2437 /* static rbd_device initialization */
2438 spin_lock_init(&rbd_dev->lock);
2439 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002440 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08002441 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002442
Josh Durginc6666012011-11-21 17:11:12 -08002443 init_rwsem(&rbd_dev->header_rwsem);
Alex Elder0e805a12012-01-11 19:42:15 -08002444
Alex Elderd184f6b2012-01-29 13:57:44 -06002445 /* generate unique id: find highest unique id, add one */
Alex Elder499afd52012-02-02 08:13:29 -06002446 rbd_id_get(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002447
Alex Eldera725f65e2012-02-02 08:13:30 -06002448 /* Fill in the device name, now that we have its id. */
Alex Elder81a89792012-02-02 08:13:30 -06002449 BUILD_BUG_ON(DEV_NAME_LEN
2450 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
2451 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->id);
Alex Eldere124a822012-01-29 13:57:44 -06002452
Alex Eldera725f65e2012-02-02 08:13:30 -06002453 /* parse add command */
Alex Elder7ef32142012-02-02 08:13:30 -06002454 rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
Alex Eldere28fff262012-02-02 08:13:30 -06002455 options, count);
Alex Eldera725f65e2012-02-02 08:13:30 -06002456 if (rc)
2457 goto err_put_id;
2458
Alex Elder5214ecc2012-02-02 08:13:30 -06002459 rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1,
2460 options);
Alex Elderd720bcb2012-02-02 08:13:30 -06002461 if (IS_ERR(rbd_dev->rbd_client)) {
2462 rc = PTR_ERR(rbd_dev->rbd_client);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002463 goto err_put_id;
Alex Elderd720bcb2012-02-02 08:13:30 -06002464 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002465
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002466 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06002467 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002468 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2469 if (rc < 0)
2470 goto err_out_client;
Alex Elder9bb2f332012-07-12 10:46:35 -05002471 rbd_dev->pool_id = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002472
2473 /* register our block device */
Alex Elder27cc2592012-02-02 08:13:30 -06002474 rc = register_blkdev(0, rbd_dev->name);
2475 if (rc < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002476 goto err_out_client;
Alex Elder27cc2592012-02-02 08:13:30 -06002477 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002478
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002479 rc = rbd_bus_add_dev(rbd_dev);
2480 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002481 goto err_out_blkdev;
2482
Alex Elder32eec682012-02-08 16:11:14 -06002483 /*
2484 * At this point cleanup in the event of an error is the job
2485 * of the sysfs code (initiated by rbd_bus_del_dev()).
2486 *
2487 * Set up and announce blkdev mapping.
2488 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002489 rc = rbd_init_disk(rbd_dev);
2490 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002491 goto err_out_bus;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002492
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002493 rc = rbd_init_watch_dev(rbd_dev);
2494 if (rc)
2495 goto err_out_bus;
2496
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002497 return count;
2498
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002499err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002500 /* this will also clean up rest of rbd_dev stuff */
2501
2502 rbd_bus_del_dev(rbd_dev);
2503 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002504 return rc;
2505
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002506err_out_blkdev:
2507 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2508err_out_client:
2509 rbd_put_client(rbd_dev);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002510err_put_id:
Alex Eldercb8627c2012-07-09 21:04:23 -05002511 if (rbd_dev->pool_name) {
2512 kfree(rbd_dev->obj_md_name);
Alex Elderbf3e5ae2012-07-09 21:04:23 -05002513 kfree(rbd_dev->obj);
Alex Eldercb8627c2012-07-09 21:04:23 -05002514 kfree(rbd_dev->pool_name);
2515 }
Alex Elder499afd52012-02-02 08:13:29 -06002516 rbd_id_put(rbd_dev);
Alex Elder27cc2592012-02-02 08:13:30 -06002517err_nomem:
Alex Elder27cc2592012-02-02 08:13:30 -06002518 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05002519 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06002520
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002521 dout("Error adding device %s\n", buf);
2522 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06002523
2524 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002525}
2526
2527static struct rbd_device *__rbd_get_dev(unsigned long id)
2528{
2529 struct list_head *tmp;
2530 struct rbd_device *rbd_dev;
2531
Alex Eldere124a822012-01-29 13:57:44 -06002532 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002533 list_for_each(tmp, &rbd_dev_list) {
2534 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Eldere124a822012-01-29 13:57:44 -06002535 if (rbd_dev->id == id) {
2536 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002537 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06002538 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002539 }
Alex Eldere124a822012-01-29 13:57:44 -06002540 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002541 return NULL;
2542}
2543
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002544static void rbd_dev_release(struct device *dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002545{
Alex Elder593a9e72012-02-07 12:03:37 -06002546 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002547
Alex Elder1dbb4392012-01-24 10:08:37 -06002548 if (rbd_dev->watch_request) {
2549 struct ceph_client *client = rbd_dev->rbd_client->client;
2550
2551 ceph_osdc_unregister_linger_request(&client->osdc,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002552 rbd_dev->watch_request);
Alex Elder1dbb4392012-01-24 10:08:37 -06002553 }
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002554 if (rbd_dev->watch_event)
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07002555 rbd_req_sync_unwatch(rbd_dev, rbd_dev->obj_md_name);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002556
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002557 rbd_put_client(rbd_dev);
2558
2559 /* clean up and free blkdev */
2560 rbd_free_disk(rbd_dev);
2561 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder32eec682012-02-08 16:11:14 -06002562
2563 /* done with the id, and with the rbd_dev */
Alex Eldercb8627c2012-07-09 21:04:23 -05002564 kfree(rbd_dev->obj_md_name);
Alex Elderd22f76e2012-07-12 10:46:35 -05002565 kfree(rbd_dev->pool_name);
Alex Elderbf3e5ae2012-07-09 21:04:23 -05002566 kfree(rbd_dev->obj);
Alex Elder32eec682012-02-08 16:11:14 -06002567 rbd_id_put(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002568 kfree(rbd_dev);
2569
2570 /* release module ref */
2571 module_put(THIS_MODULE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002572}
2573
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002574static ssize_t rbd_remove(struct bus_type *bus,
2575 const char *buf,
2576 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002577{
2578 struct rbd_device *rbd_dev = NULL;
2579 int target_id, rc;
2580 unsigned long ul;
2581 int ret = count;
2582
2583 rc = strict_strtoul(buf, 10, &ul);
2584 if (rc)
2585 return rc;
2586
2587 /* convert to int; abort if we lost anything in the conversion */
2588 target_id = (int) ul;
2589 if (target_id != ul)
2590 return -EINVAL;
2591
2592 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2593
2594 rbd_dev = __rbd_get_dev(target_id);
2595 if (!rbd_dev) {
2596 ret = -ENOENT;
2597 goto done;
2598 }
2599
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002600 __rbd_remove_all_snaps(rbd_dev);
2601 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002602
2603done:
2604 mutex_unlock(&ctl_mutex);
2605 return ret;
2606}
2607
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002608static ssize_t rbd_snap_add(struct device *dev,
2609 struct device_attribute *attr,
2610 const char *buf,
2611 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002612{
Alex Elder593a9e72012-02-07 12:03:37 -06002613 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002614 int ret;
2615 char *name = kmalloc(count + 1, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002616 if (!name)
2617 return -ENOMEM;
2618
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002619 snprintf(name, count, "%s", buf);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002620
2621 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2622
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002623 ret = rbd_header_add_snap(rbd_dev,
2624 name, GFP_KERNEL);
2625 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002626 goto err_unlock;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002627
Josh Durgin263c6ca2011-12-05 10:43:42 -08002628 ret = __rbd_refresh_header(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002629 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002630 goto err_unlock;
2631
2632 /* shouldn't hold ctl_mutex when notifying.. notify might
2633 trigger a watch callback that would need to get that mutex */
2634 mutex_unlock(&ctl_mutex);
2635
2636 /* make a best effort, don't error if failed */
2637 rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002638
2639 ret = count;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002640 kfree(name);
2641 return ret;
2642
2643err_unlock:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002644 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002645 kfree(name);
2646 return ret;
2647}
2648
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002649/*
2650 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002651 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002652 */
2653static int rbd_sysfs_init(void)
2654{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002655 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002656
Alex Elderfed4c142012-02-07 12:03:36 -06002657 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06002658 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002659 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002660
Alex Elderfed4c142012-02-07 12:03:36 -06002661 ret = bus_register(&rbd_bus_type);
2662 if (ret < 0)
2663 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002664
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002665 return ret;
2666}
2667
2668static void rbd_sysfs_cleanup(void)
2669{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002670 bus_unregister(&rbd_bus_type);
Alex Elderfed4c142012-02-07 12:03:36 -06002671 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002672}
2673
2674int __init rbd_init(void)
2675{
2676 int rc;
2677
2678 rc = rbd_sysfs_init();
2679 if (rc)
2680 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002681 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002682 return 0;
2683}
2684
2685void __exit rbd_exit(void)
2686{
2687 rbd_sysfs_cleanup();
2688}
2689
2690module_init(rbd_init);
2691module_exit(rbd_exit);
2692
2693MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
2694MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
2695MODULE_DESCRIPTION("rados block device");
2696
2697/* following authorship retained from original osdblk.c */
2698MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
2699
2700MODULE_LICENSE("GPL");