blob: 2fe160014f58a40b1036905fe4b941bbaf027f7a [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
/*
 * Block I/O is counted in sectors.  Several Linux layers (blk, bio,
 * genhd) each interpret the term, but the default everywhere is 512
 * bytes.  These names simply read better than the raw numbers.
 */
#define SECTOR_SHIFT	9
#define SECTOR_SIZE	(1ULL << SECTOR_SHIFT)
52
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

#define RBD_MAX_SNAP_NAME_LEN	32
#define RBD_MAX_OPT_LEN		1024

/* Snapshot name that selects the image head rather than a snapshot */
#define RBD_SNAP_HEAD_NAME	"-"

/*
 * RBD device names have the form "rbd#": RBD_DRV_NAME followed by a
 * unique integer id.  MAX_INT_FORMAT_WIDTH is used in ensuring
 * DEV_NAME_LEN is large enough for every possible device name.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
73
Yehuda Sadeh602adf42010-08-12 16:11:25 -070074/*
75 * block device image metadata (in-memory version)
76 */
77struct rbd_image_header {
78 u64 image_size;
Alex Elder849b4262012-07-09 21:04:24 -050079 char *object_prefix;
Yehuda Sadeh602adf42010-08-12 16:11:25 -070080 __u8 obj_order;
81 __u8 crypt_type;
82 __u8 comp_type;
Yehuda Sadeh602adf42010-08-12 16:11:25 -070083 struct ceph_snap_context *snapc;
84 size_t snap_names_len;
85 u64 snap_seq;
86 u32 total_snaps;
87
88 char *snap_names;
89 u64 *snap_sizes;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070090
91 u64 obj_version;
92};
93
/* rbd-specific options parsed by parse_rbd_opts_token() */
struct rbd_options {
	int	notify_timeout;	/* defaults to RBD_NOTIFY_TIMEOUT_DEFAULT */
};
97
98/*
Alex Elderf0f8cef2012-01-29 13:57:44 -060099 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700100 */
101struct rbd_client {
102 struct ceph_client *client;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700103 struct rbd_options *rbd_opts;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700104 struct kref kref;
105 struct list_head node;
106};
107
108/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600109 * a request completion status
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700110 */
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700111struct rbd_req_status {
112 int done;
113 int rc;
114 u64 bytes;
115};
116
117/*
118 * a collection of requests
119 */
120struct rbd_req_coll {
121 int total;
122 int num_done;
123 struct kref kref;
124 struct rbd_req_status status[0];
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700125};
126
Alex Elderf0f8cef2012-01-29 13:57:44 -0600127/*
128 * a single io request
129 */
130struct rbd_request {
131 struct request *rq; /* blk layer request */
132 struct bio *bio; /* cloned bio */
133 struct page **pages; /* list of used pages */
134 u64 len;
135 int coll_index;
136 struct rbd_req_coll *coll;
137};
138
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800139struct rbd_snap {
140 struct device dev;
141 const char *name;
Josh Durgin35915382011-12-05 18:25:13 -0800142 u64 size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800143 struct list_head node;
144 u64 id;
145};
146
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700147/*
148 * a single device
149 */
150struct rbd_device {
151 int id; /* blkdev unique id */
152
153 int major; /* blkdev assigned major */
154 struct gendisk *disk; /* blkdev's gendisk and rq */
155 struct request_queue *q;
156
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700157 struct rbd_client *rbd_client;
158
159 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
160
161 spinlock_t lock; /* queue lock */
162
163 struct rbd_image_header header;
Alex Elder0bed54d2012-07-03 16:01:18 -0500164 char *image_name;
165 size_t image_name_len;
166 char *header_name;
Alex Elderd22f76e2012-07-12 10:46:35 -0500167 char *pool_name;
Alex Elder9bb2f332012-07-12 10:46:35 -0500168 int pool_id;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700169
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700170 struct ceph_osd_event *watch_event;
171 struct ceph_osd_request *watch_request;
172
Josh Durginc6666012011-11-21 17:11:12 -0800173 /* protects updating the header */
174 struct rw_semaphore header_rwsem;
Alex Elder820a5f32012-07-09 21:04:24 -0500175 char *snap_name;
Josh Durgin77dfe992011-11-21 13:04:42 -0800176 u64 snap_id; /* current snapshot id */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700177 int read_only;
178
179 struct list_head node;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800180
181 /* list of snapshots */
182 struct list_head snaps;
183
184 /* sysfs related */
185 struct device dev;
186};
187
/* Serialize open/close/setup/teardown */
static DEFINE_MUTEX(ctl_mutex);

/* All rbd devices */
static LIST_HEAD(rbd_dev_list);
static DEFINE_SPINLOCK(rbd_dev_list_lock);

/* All rbd clients */
static LIST_HEAD(rbd_client_list);
static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700195
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800196static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
197static void rbd_dev_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800198static ssize_t rbd_snap_add(struct device *dev,
199 struct device_attribute *attr,
200 const char *buf,
201 size_t count);
202static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
Justin P. Mattock69932482011-07-26 23:06:29 -0700203 struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800204
Alex Elderf0f8cef2012-01-29 13:57:44 -0600205static ssize_t rbd_add(struct bus_type *bus, const char *buf,
206 size_t count);
207static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
208 size_t count);
209
210static struct bus_attribute rbd_bus_attrs[] = {
211 __ATTR(add, S_IWUSR, NULL, rbd_add),
212 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
213 __ATTR_NULL
214};
215
216static struct bus_type rbd_bus_type = {
217 .name = "rbd",
218 .bus_attrs = rbd_bus_attrs,
219};
220
/*
 * Release callback for rbd_root_dev.  Intentionally empty: it does
 * nothing with @dev.
 */
static void rbd_root_dev_release(struct device *dev)
{
}
224
225static struct device rbd_root_dev = {
226 .init_name = "rbd",
227 .release = rbd_root_dev_release,
228};
229
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800230
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800231static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
232{
233 return get_device(&rbd_dev->dev);
234}
235
236static void rbd_put_dev(struct rbd_device *rbd_dev)
237{
238 put_device(&rbd_dev->dev);
239}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700240
Josh Durgin263c6ca2011-12-05 10:43:42 -0800241static int __rbd_refresh_header(struct rbd_device *rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700242
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700243static int rbd_open(struct block_device *bdev, fmode_t mode)
244{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600245 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700246
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800247 rbd_get_dev(rbd_dev);
248
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700249 set_device_ro(bdev, rbd_dev->read_only);
250
251 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
252 return -EROFS;
253
254 return 0;
255}
256
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800257static int rbd_release(struct gendisk *disk, fmode_t mode)
258{
259 struct rbd_device *rbd_dev = disk->private_data;
260
261 rbd_put_dev(rbd_dev);
262
263 return 0;
264}
265
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700266static const struct block_device_operations rbd_bd_ops = {
267 .owner = THIS_MODULE,
268 .open = rbd_open,
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800269 .release = rbd_release,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700270};
271
272/*
273 * Initialize an rbd client instance.
274 * We own *opt.
275 */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700276static struct rbd_client *rbd_client_create(struct ceph_options *opt,
277 struct rbd_options *rbd_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700278{
279 struct rbd_client *rbdc;
280 int ret = -ENOMEM;
281
282 dout("rbd_client_create\n");
283 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
284 if (!rbdc)
285 goto out_opt;
286
287 kref_init(&rbdc->kref);
288 INIT_LIST_HEAD(&rbdc->node);
289
Alex Elderbc534d82012-01-29 13:57:44 -0600290 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
291
Sage Weil6ab00d42011-08-09 09:41:59 -0700292 rbdc->client = ceph_create_client(opt, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700293 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600294 goto out_mutex;
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400295 opt = NULL; /* Now rbdc->client is responsible for opt */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700296
297 ret = ceph_open_session(rbdc->client);
298 if (ret < 0)
299 goto out_err;
300
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700301 rbdc->rbd_opts = rbd_opts;
302
Alex Elder432b8582012-01-29 13:57:44 -0600303 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700304 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600305 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700306
Alex Elderbc534d82012-01-29 13:57:44 -0600307 mutex_unlock(&ctl_mutex);
308
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700309 dout("rbd_client_create created %p\n", rbdc);
310 return rbdc;
311
312out_err:
313 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600314out_mutex:
315 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700316 kfree(rbdc);
317out_opt:
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400318 if (opt)
319 ceph_destroy_options(opt);
320 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700321}
322
323/*
324 * Find a ceph client with specific addr and configuration.
325 */
326static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
327{
328 struct rbd_client *client_node;
329
330 if (opt->flags & CEPH_OPT_NOSHARE)
331 return NULL;
332
333 list_for_each_entry(client_node, &rbd_client_list, node)
334 if (ceph_compare_options(opt, client_node->client) == 0)
335 return client_node;
336 return NULL;
337}
338
/*
 * Option tokens.  Tokens below Opt_last_int take an integer
 * argument; tokens between Opt_last_int and Opt_last_string take a
 * string argument.
 */
enum {
	Opt_notify_timeout,
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
};
349
350static match_table_t rbdopt_tokens = {
351 {Opt_notify_timeout, "notify_timeout=%d"},
352 /* int args above */
353 /* string args above */
354 {-1, NULL}
355};
356
357static int parse_rbd_opts_token(char *c, void *private)
358{
359 struct rbd_options *rbdopt = private;
360 substring_t argstr[MAX_OPT_ARGS];
361 int token, intval, ret;
362
Alex Elder21079782012-01-24 10:08:36 -0600363 token = match_token(c, rbdopt_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700364 if (token < 0)
365 return -EINVAL;
366
367 if (token < Opt_last_int) {
368 ret = match_int(&argstr[0], &intval);
369 if (ret < 0) {
370 pr_err("bad mount option arg (not int) "
371 "at '%s'\n", c);
372 return ret;
373 }
374 dout("got int token %d val %d\n", token, intval);
375 } else if (token > Opt_last_int && token < Opt_last_string) {
376 dout("got string token %d val %s\n", token,
377 argstr[0].from);
378 } else {
379 dout("got token %d\n", token);
380 }
381
382 switch (token) {
383 case Opt_notify_timeout:
384 rbdopt->notify_timeout = intval;
385 break;
386 default:
387 BUG_ON(token);
388 }
389 return 0;
390}
391
392/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700393 * Get a ceph client with specific addr and configuration, if one does
394 * not exist create it.
395 */
Alex Elder5214ecc2012-02-02 08:13:30 -0600396static struct rbd_client *rbd_get_client(const char *mon_addr,
397 size_t mon_addr_len,
398 char *options)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700399{
400 struct rbd_client *rbdc;
401 struct ceph_options *opt;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700402 struct rbd_options *rbd_opts;
403
404 rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
405 if (!rbd_opts)
Alex Elderd720bcb2012-02-02 08:13:30 -0600406 return ERR_PTR(-ENOMEM);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700407
408 rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700409
Alex Elderee577412012-01-24 10:08:36 -0600410 opt = ceph_parse_options(options, mon_addr,
Alex Elder5214ecc2012-02-02 08:13:30 -0600411 mon_addr + mon_addr_len,
Alex Elder21079782012-01-24 10:08:36 -0600412 parse_rbd_opts_token, rbd_opts);
Alex Elderee577412012-01-24 10:08:36 -0600413 if (IS_ERR(opt)) {
Alex Elderd720bcb2012-02-02 08:13:30 -0600414 kfree(rbd_opts);
415 return ERR_CAST(opt);
Alex Elderee577412012-01-24 10:08:36 -0600416 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700417
Alex Elder432b8582012-01-29 13:57:44 -0600418 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700419 rbdc = __rbd_client_find(opt);
420 if (rbdc) {
Alex Eldere6994d3d2012-01-29 13:57:44 -0600421 /* using an existing client */
422 kref_get(&rbdc->kref);
Alex Elder432b8582012-01-29 13:57:44 -0600423 spin_unlock(&rbd_client_list_lock);
Alex Eldere6994d3d2012-01-29 13:57:44 -0600424
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700425 ceph_destroy_options(opt);
Alex Elder97bb59a2012-01-24 10:08:36 -0600426 kfree(rbd_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700427
Alex Elderd720bcb2012-02-02 08:13:30 -0600428 return rbdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700429 }
Alex Elder432b8582012-01-29 13:57:44 -0600430 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700431
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700432 rbdc = rbd_client_create(opt, rbd_opts);
Alex Elderd97081b2012-01-29 13:57:44 -0600433
Alex Elderd720bcb2012-02-02 08:13:30 -0600434 if (IS_ERR(rbdc))
435 kfree(rbd_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700436
Alex Elderd720bcb2012-02-02 08:13:30 -0600437 return rbdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700438}
439
440/*
441 * Destroy ceph client
Alex Elderd23a4b32012-01-29 13:57:43 -0600442 *
Alex Elder432b8582012-01-29 13:57:44 -0600443 * Caller must hold rbd_client_list_lock.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700444 */
445static void rbd_client_release(struct kref *kref)
446{
447 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
448
449 dout("rbd_release_client %p\n", rbdc);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500450 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700451 list_del(&rbdc->node);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500452 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700453
454 ceph_destroy_client(rbdc->client);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700455 kfree(rbdc->rbd_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700456 kfree(rbdc);
457}
458
459/*
460 * Drop reference to ceph client node. If it's not referenced anymore, release
461 * it.
462 */
463static void rbd_put_client(struct rbd_device *rbd_dev)
464{
465 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
466 rbd_dev->rbd_client = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700467}
468
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700469/*
470 * Destroy requests collection
471 */
472static void rbd_coll_release(struct kref *kref)
473{
474 struct rbd_req_coll *coll =
475 container_of(kref, struct rbd_req_coll, kref);
476
477 dout("rbd_coll_release %p\n", coll);
478 kfree(coll);
479}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700480
481/*
482 * Create a new header structure, translate header format from the on-disk
483 * header.
484 */
485static int rbd_header_from_disk(struct rbd_image_header *header,
486 struct rbd_image_header_ondisk *ondisk,
Xi Wang50f7c4c2012-04-20 15:49:44 -0500487 u32 allocated_snaps,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700488 gfp_t gfp_flags)
489{
Xi Wang50f7c4c2012-04-20 15:49:44 -0500490 u32 i, snap_count;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700491
Alex Elder21079782012-01-24 10:08:36 -0600492 if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT)))
Josh Durgin81e759f2011-11-15 14:49:53 -0800493 return -ENXIO;
Josh Durgin81e759f2011-11-15 14:49:53 -0800494
Alex Elder00f1f362012-02-07 12:03:36 -0600495 snap_count = le32_to_cpu(ondisk->snap_count);
Xi Wang50f7c4c2012-04-20 15:49:44 -0500496 if (snap_count > (UINT_MAX - sizeof(struct ceph_snap_context))
497 / sizeof (*ondisk))
498 return -EINVAL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700499 header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
Yan, Zhengf9f9a192012-06-06 09:15:33 -0500500 snap_count * sizeof(u64),
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700501 gfp_flags);
502 if (!header->snapc)
503 return -ENOMEM;
Alex Elder00f1f362012-02-07 12:03:36 -0600504
Alex Elder00f1f362012-02-07 12:03:36 -0600505 header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700506 if (snap_count) {
507 header->snap_names = kmalloc(header->snap_names_len,
Dan Carpenterf8ad4952012-04-20 15:49:44 -0500508 gfp_flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700509 if (!header->snap_names)
510 goto err_snapc;
511 header->snap_sizes = kmalloc(snap_count * sizeof(u64),
Dan Carpenterf8ad4952012-04-20 15:49:44 -0500512 gfp_flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700513 if (!header->snap_sizes)
514 goto err_names;
515 } else {
516 header->snap_names = NULL;
517 header->snap_sizes = NULL;
518 }
Alex Elder849b4262012-07-09 21:04:24 -0500519
520 header->object_prefix = kmalloc(sizeof (ondisk->block_name) + 1,
521 gfp_flags);
522 if (!header->object_prefix)
523 goto err_sizes;
524
Alex Elderca1e49a2012-07-10 20:30:09 -0500525 memcpy(header->object_prefix, ondisk->block_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700526 sizeof(ondisk->block_name));
Alex Elder849b4262012-07-09 21:04:24 -0500527 header->object_prefix[sizeof (ondisk->block_name)] = '\0';
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700528
529 header->image_size = le64_to_cpu(ondisk->image_size);
530 header->obj_order = ondisk->options.order;
531 header->crypt_type = ondisk->options.crypt_type;
532 header->comp_type = ondisk->options.comp_type;
533
534 atomic_set(&header->snapc->nref, 1);
535 header->snap_seq = le64_to_cpu(ondisk->snap_seq);
536 header->snapc->num_snaps = snap_count;
537 header->total_snaps = snap_count;
538
Alex Elder21079782012-01-24 10:08:36 -0600539 if (snap_count && allocated_snaps == snap_count) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700540 for (i = 0; i < snap_count; i++) {
541 header->snapc->snaps[i] =
542 le64_to_cpu(ondisk->snaps[i].id);
543 header->snap_sizes[i] =
544 le64_to_cpu(ondisk->snaps[i].image_size);
545 }
546
547 /* copy snapshot names */
548 memcpy(header->snap_names, &ondisk->snaps[i],
549 header->snap_names_len);
550 }
551
552 return 0;
553
Alex Elder849b4262012-07-09 21:04:24 -0500554err_sizes:
555 kfree(header->snap_sizes);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700556err_names:
557 kfree(header->snap_names);
558err_snapc:
559 kfree(header->snapc);
Alex Elder00f1f362012-02-07 12:03:36 -0600560 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700561}
562
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700563static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
564 u64 *seq, u64 *size)
565{
566 int i;
567 char *p = header->snap_names;
568
Alex Elder00f1f362012-02-07 12:03:36 -0600569 for (i = 0; i < header->total_snaps; i++) {
570 if (!strcmp(snap_name, p)) {
571
572 /* Found it. Pass back its id and/or size */
573
574 if (seq)
575 *seq = header->snapc->snaps[i];
576 if (size)
577 *size = header->snap_sizes[i];
578 return i;
579 }
580 p += strlen(p) + 1; /* Skip ahead to the next name */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700581 }
Alex Elder00f1f362012-02-07 12:03:36 -0600582 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700583}
584
Alex Elder0ce1a792012-07-03 16:01:18 -0500585static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700586{
Alex Elder0ce1a792012-07-03 16:01:18 -0500587 struct rbd_image_header *header = &rbd_dev->header;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700588 struct ceph_snap_context *snapc = header->snapc;
589 int ret = -ENOENT;
590
Alex Elder0ce1a792012-07-03 16:01:18 -0500591 down_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700592
Alex Elder0ce1a792012-07-03 16:01:18 -0500593 if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
Josh Durgincc9d7342011-11-21 18:19:13 -0800594 sizeof (RBD_SNAP_HEAD_NAME))) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700595 if (header->total_snaps)
596 snapc->seq = header->snap_seq;
597 else
598 snapc->seq = 0;
Alex Elder0ce1a792012-07-03 16:01:18 -0500599 rbd_dev->snap_id = CEPH_NOSNAP;
600 rbd_dev->read_only = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700601 if (size)
602 *size = header->image_size;
603 } else {
Alex Elder0ce1a792012-07-03 16:01:18 -0500604 ret = snap_by_name(header, rbd_dev->snap_name,
605 &snapc->seq, size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700606 if (ret < 0)
607 goto done;
Alex Elder0ce1a792012-07-03 16:01:18 -0500608 rbd_dev->snap_id = snapc->seq;
609 rbd_dev->read_only = 1;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700610 }
611
612 ret = 0;
613done:
Alex Elder0ce1a792012-07-03 16:01:18 -0500614 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700615 return ret;
616}
617
618static void rbd_header_free(struct rbd_image_header *header)
619{
Alex Elder849b4262012-07-09 21:04:24 -0500620 kfree(header->object_prefix);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700621 kfree(header->snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -0500622 kfree(header->snap_names);
623 kfree(header->snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700624}
625
626/*
627 * get the actual striped segment name, offset and length
628 */
629static u64 rbd_get_segment(struct rbd_image_header *header,
Alex Elderca1e49a2012-07-10 20:30:09 -0500630 const char *object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700631 u64 ofs, u64 len,
632 char *seg_name, u64 *segofs)
633{
634 u64 seg = ofs >> header->obj_order;
635
636 if (seg_name)
637 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
Alex Elderca1e49a2012-07-10 20:30:09 -0500638 "%s.%012llx", object_prefix, seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700639
640 ofs = ofs & ((1 << header->obj_order) - 1);
641 len = min_t(u64, len, (1 << header->obj_order) - ofs);
642
643 if (segofs)
644 *segofs = ofs;
645
646 return len;
647}
648
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700649static int rbd_get_num_segments(struct rbd_image_header *header,
650 u64 ofs, u64 len)
651{
652 u64 start_seg = ofs >> header->obj_order;
653 u64 end_seg = (ofs + len - 1) >> header->obj_order;
654 return end_seg - start_seg + 1;
655}
656
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700657/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700658 * returns the size of an object in the image
659 */
660static u64 rbd_obj_bytes(struct rbd_image_header *header)
661{
662 return 1 << header->obj_order;
663}
664
665/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700666 * bio helpers
667 */
668
669static void bio_chain_put(struct bio *chain)
670{
671 struct bio *tmp;
672
673 while (chain) {
674 tmp = chain;
675 chain = chain->bi_next;
676 bio_put(tmp);
677 }
678}
679
680/*
681 * zeros a bio chain, starting at specific offset
682 */
683static void zero_bio_chain(struct bio *chain, int start_ofs)
684{
685 struct bio_vec *bv;
686 unsigned long flags;
687 void *buf;
688 int i;
689 int pos = 0;
690
691 while (chain) {
692 bio_for_each_segment(bv, chain, i) {
693 if (pos + bv->bv_len > start_ofs) {
694 int remainder = max(start_ofs - pos, 0);
695 buf = bvec_kmap_irq(bv, &flags);
696 memset(buf + remainder, 0,
697 bv->bv_len - remainder);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +0200698 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700699 }
700 pos += bv->bv_len;
701 }
702
703 chain = chain->bi_next;
704 }
705}
706
707/*
708 * bio_chain_clone - clone a chain of bios up to a certain length.
709 * might return a bio_pair that will need to be released.
710 */
711static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
712 struct bio_pair **bp,
713 int len, gfp_t gfpmask)
714{
715 struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
716 int total = 0;
717
718 if (*bp) {
719 bio_pair_release(*bp);
720 *bp = NULL;
721 }
722
723 while (old_chain && (total < len)) {
724 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
725 if (!tmp)
726 goto err_out;
727
728 if (total + old_chain->bi_size > len) {
729 struct bio_pair *bp;
730
731 /*
732 * this split can only happen with a single paged bio,
733 * split_bio will BUG_ON if this is not the case
734 */
735 dout("bio_chain_clone split! total=%d remaining=%d"
736 "bi_size=%d\n",
737 (int)total, (int)len-total,
738 (int)old_chain->bi_size);
739
740 /* split the bio. We'll release it either in the next
741 call, or it will have to be released outside */
Alex Elder593a9e72012-02-07 12:03:37 -0600742 bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700743 if (!bp)
744 goto err_out;
745
746 __bio_clone(tmp, &bp->bio1);
747
748 *next = &bp->bio2;
749 } else {
750 __bio_clone(tmp, old_chain);
751 *next = old_chain->bi_next;
752 }
753
754 tmp->bi_bdev = NULL;
755 gfpmask &= ~__GFP_WAIT;
756 tmp->bi_next = NULL;
757
758 if (!new_chain) {
759 new_chain = tail = tmp;
760 } else {
761 tail->bi_next = tmp;
762 tail = tmp;
763 }
764 old_chain = old_chain->bi_next;
765
766 total += tmp->bi_size;
767 }
768
769 BUG_ON(total < len);
770
771 if (tail)
772 tail->bi_next = NULL;
773
774 *old = old_chain;
775
776 return new_chain;
777
778err_out:
779 dout("bio_chain_clone with err\n");
780 bio_chain_put(new_chain);
781 return NULL;
782}
783
784/*
785 * helpers for osd request op vectors.
786 */
787static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
788 int num_ops,
789 int opcode,
790 u32 payload_len)
791{
792 *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
793 GFP_NOIO);
794 if (!*ops)
795 return -ENOMEM;
796 (*ops)[0].op = opcode;
797 /*
798 * op extent offset and length will be set later on
799 * in calc_raw_layout()
800 */
801 (*ops)[0].payload_len = payload_len;
802 return 0;
803}
804
/* Free an op vector obtained from rbd_create_rw_ops(); NULL is fine. */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
809
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700810static void rbd_coll_end_req_index(struct request *rq,
811 struct rbd_req_coll *coll,
812 int index,
813 int ret, u64 len)
814{
815 struct request_queue *q;
816 int min, max, i;
817
818 dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n",
819 coll, index, ret, len);
820
821 if (!rq)
822 return;
823
824 if (!coll) {
825 blk_end_request(rq, ret, len);
826 return;
827 }
828
829 q = rq->q;
830
831 spin_lock_irq(q->queue_lock);
832 coll->status[index].done = 1;
833 coll->status[index].rc = ret;
834 coll->status[index].bytes = len;
835 max = min = coll->num_done;
836 while (max < coll->total && coll->status[max].done)
837 max++;
838
839 for (i = min; i<max; i++) {
840 __blk_end_request(rq, coll->status[i].rc,
841 coll->status[i].bytes);
842 coll->num_done++;
843 kref_put(&coll->kref, rbd_coll_release);
844 }
845 spin_unlock_irq(q->queue_lock);
846}
847
848static void rbd_coll_end_req(struct rbd_request *req,
849 int ret, u64 len)
850{
851 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
852}
853
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700854/*
855 * Send ceph osd request
856 */
857static int rbd_do_request(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -0500858 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700859 struct ceph_snap_context *snapc,
860 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -0500861 const char *object_name, u64 ofs, u64 len,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700862 struct bio *bio,
863 struct page **pages,
864 int num_pages,
865 int flags,
866 struct ceph_osd_req_op *ops,
867 int num_reply,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700868 struct rbd_req_coll *coll,
869 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700870 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700871 struct ceph_msg *msg),
872 struct ceph_osd_request **linger_req,
873 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700874{
875 struct ceph_osd_request *req;
876 struct ceph_file_layout *layout;
877 int ret;
878 u64 bno;
879 struct timespec mtime = CURRENT_TIME;
880 struct rbd_request *req_data;
881 struct ceph_osd_request_head *reqhead;
Alex Elder1dbb4392012-01-24 10:08:37 -0600882 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700883
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700884 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700885 if (!req_data) {
886 if (coll)
887 rbd_coll_end_req_index(rq, coll, coll_index,
888 -ENOMEM, len);
889 return -ENOMEM;
890 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700891
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700892 if (coll) {
893 req_data->coll = coll;
894 req_data->coll_index = coll_index;
895 }
896
Alex Elderaded07e2012-07-03 16:01:18 -0500897 dout("rbd_do_request object_name=%s ofs=%lld len=%lld\n",
898 object_name, len, ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700899
Alex Elder0ce1a792012-07-03 16:01:18 -0500900 down_read(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700901
Alex Elder0ce1a792012-07-03 16:01:18 -0500902 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder1dbb4392012-01-24 10:08:37 -0600903 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
904 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -0700905 if (!req) {
Alex Elder0ce1a792012-07-03 16:01:18 -0500906 up_read(&rbd_dev->header_rwsem);
Sage Weil4ad12622011-05-03 09:23:36 -0700907 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700908 goto done_pages;
909 }
910
911 req->r_callback = rbd_cb;
912
913 req_data->rq = rq;
914 req_data->bio = bio;
915 req_data->pages = pages;
916 req_data->len = len;
917
918 req->r_priv = req_data;
919
920 reqhead = req->r_request->front.iov_base;
921 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
922
Alex Elderaded07e2012-07-03 16:01:18 -0500923 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700924 req->r_oid_len = strlen(req->r_oid);
925
926 layout = &req->r_file_layout;
927 memset(layout, 0, sizeof(*layout));
928 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
929 layout->fl_stripe_count = cpu_to_le32(1);
930 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
Alex Elder0ce1a792012-07-03 16:01:18 -0500931 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
Alex Elder1dbb4392012-01-24 10:08:37 -0600932 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
933 req, ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700934
935 ceph_osdc_build_request(req, ofs, &len,
936 ops,
937 snapc,
938 &mtime,
939 req->r_oid, req->r_oid_len);
Alex Elder0ce1a792012-07-03 16:01:18 -0500940 up_read(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700941
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700942 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600943 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700944 *linger_req = req;
945 }
946
Alex Elder1dbb4392012-01-24 10:08:37 -0600947 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700948 if (ret < 0)
949 goto done_err;
950
951 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600952 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700953 if (ver)
954 *ver = le64_to_cpu(req->r_reassert_version.version);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700955 dout("reassert_ver=%lld\n",
956 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700957 ceph_osdc_put_request(req);
958 }
959 return ret;
960
961done_err:
962 bio_chain_put(req_data->bio);
963 ceph_osdc_put_request(req);
964done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700965 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700966 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700967 return ret;
968}
969
970/*
971 * Ceph osd op callback
972 */
973static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
974{
975 struct rbd_request *req_data = req->r_priv;
976 struct ceph_osd_reply_head *replyhead;
977 struct ceph_osd_op *op;
978 __s32 rc;
979 u64 bytes;
980 int read_op;
981
982 /* parse reply */
983 replyhead = msg->front.iov_base;
984 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
985 op = (void *)(replyhead + 1);
986 rc = le32_to_cpu(replyhead->result);
987 bytes = le64_to_cpu(op->extent.length);
Dan Carpenter895cfcc2012-06-06 09:15:33 -0500988 read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700989
990 dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);
991
992 if (rc == -ENOENT && read_op) {
993 zero_bio_chain(req_data->bio, 0);
994 rc = 0;
995 } else if (rc == 0 && read_op && bytes < req_data->len) {
996 zero_bio_chain(req_data->bio, bytes);
997 bytes = req_data->len;
998 }
999
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001000 rbd_coll_end_req(req_data, rc, bytes);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001001
1002 if (req_data->bio)
1003 bio_chain_put(req_data->bio);
1004
1005 ceph_osdc_put_request(req);
1006 kfree(req_data);
1007}
1008
/*
 * Minimal completion callback: the reply is not inspected, the only
 * work is dropping the reference held on the OSD request.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req,
			      struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1013
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001014/*
1015 * Do a synchronous ceph osd operation
1016 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001017static int rbd_req_sync_op(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001018 struct ceph_snap_context *snapc,
1019 u64 snapid,
1020 int opcode,
1021 int flags,
1022 struct ceph_osd_req_op *orig_ops,
1023 int num_reply,
Alex Elderaded07e2012-07-03 16:01:18 -05001024 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001025 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001026 char *buf,
1027 struct ceph_osd_request **linger_req,
1028 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001029{
1030 int ret;
1031 struct page **pages;
1032 int num_pages;
1033 struct ceph_osd_req_op *ops = orig_ops;
1034 u32 payload_len;
1035
1036 num_pages = calc_pages_for(ofs , len);
1037 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
Dan Carpenterb8d06382010-10-11 21:14:23 +02001038 if (IS_ERR(pages))
1039 return PTR_ERR(pages);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001040
1041 if (!orig_ops) {
1042 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
1043 ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
1044 if (ret < 0)
1045 goto done;
1046
1047 if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
1048 ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
1049 if (ret < 0)
1050 goto done_ops;
1051 }
1052 }
1053
Alex Elder0ce1a792012-07-03 16:01:18 -05001054 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001055 object_name, ofs, len, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001056 pages, num_pages,
1057 flags,
1058 ops,
1059 2,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001060 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001061 NULL,
1062 linger_req, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001063 if (ret < 0)
1064 goto done_ops;
1065
1066 if ((flags & CEPH_OSD_FLAG_READ) && buf)
1067 ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
1068
1069done_ops:
1070 if (!orig_ops)
1071 rbd_destroy_ops(ops);
1072done:
1073 ceph_release_page_vector(pages, num_pages);
1074 return ret;
1075}
1076
1077/*
1078 * Do an asynchronous ceph osd operation
1079 */
1080static int rbd_do_op(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -05001081 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001082 struct ceph_snap_context *snapc,
1083 u64 snapid,
1084 int opcode, int flags, int num_reply,
1085 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001086 struct bio *bio,
1087 struct rbd_req_coll *coll,
1088 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001089{
1090 char *seg_name;
1091 u64 seg_ofs;
1092 u64 seg_len;
1093 int ret;
1094 struct ceph_osd_req_op *ops;
1095 u32 payload_len;
1096
1097 seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
1098 if (!seg_name)
1099 return -ENOMEM;
1100
1101 seg_len = rbd_get_segment(&rbd_dev->header,
Alex Elderca1e49a2012-07-10 20:30:09 -05001102 rbd_dev->header.object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001103 ofs, len,
1104 seg_name, &seg_ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001105
1106 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1107
1108 ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
1109 if (ret < 0)
1110 goto done;
1111
1112 /* we've taken care of segment sizes earlier when we
1113 cloned the bios. We should never have a segment
1114 truncated at this point */
1115 BUG_ON(seg_len < len);
1116
1117 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1118 seg_name, seg_ofs, seg_len,
1119 bio,
1120 NULL, 0,
1121 flags,
1122 ops,
1123 num_reply,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001124 coll, coll_index,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001125 rbd_req_cb, 0, NULL);
Sage Weil11f77002011-05-12 16:13:54 -07001126
1127 rbd_destroy_ops(ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001128done:
1129 kfree(seg_name);
1130 return ret;
1131}
1132
1133/*
1134 * Request async osd write
1135 */
1136static int rbd_req_write(struct request *rq,
1137 struct rbd_device *rbd_dev,
1138 struct ceph_snap_context *snapc,
1139 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001140 struct bio *bio,
1141 struct rbd_req_coll *coll,
1142 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001143{
1144 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1145 CEPH_OSD_OP_WRITE,
1146 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1147 2,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001148 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001149}
1150
1151/*
1152 * Request async osd read
1153 */
1154static int rbd_req_read(struct request *rq,
1155 struct rbd_device *rbd_dev,
1156 u64 snapid,
1157 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001158 struct bio *bio,
1159 struct rbd_req_coll *coll,
1160 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001161{
1162 return rbd_do_op(rq, rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001163 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001164 CEPH_OSD_OP_READ,
1165 CEPH_OSD_FLAG_READ,
1166 2,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001167 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001168}
1169
1170/*
1171 * Request sync osd read
1172 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001173static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001174 struct ceph_snap_context *snapc,
1175 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001176 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001177 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001178 char *buf,
1179 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001180{
Alex Elder0ce1a792012-07-03 16:01:18 -05001181 return rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001182 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001183 CEPH_OSD_OP_READ,
1184 CEPH_OSD_FLAG_READ,
1185 NULL,
Alex Elderaded07e2012-07-03 16:01:18 -05001186 1, object_name, ofs, len, buf, NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001187}
1188
1189/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001190 * Request sync osd watch
1191 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001192static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001193 u64 ver,
1194 u64 notify_id,
Alex Elderaded07e2012-07-03 16:01:18 -05001195 const char *object_name)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001196{
1197 struct ceph_osd_req_op *ops;
Sage Weil11f77002011-05-12 16:13:54 -07001198 int ret;
1199
1200 ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001201 if (ret < 0)
1202 return ret;
1203
Alex Elder0ce1a792012-07-03 16:01:18 -05001204 ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001205 ops[0].watch.cookie = notify_id;
1206 ops[0].watch.flag = 0;
1207
Alex Elder0ce1a792012-07-03 16:01:18 -05001208 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
Alex Elderaded07e2012-07-03 16:01:18 -05001209 object_name, 0, 0, NULL,
Alex Elderad4f2322012-07-03 16:01:19 -05001210 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001211 CEPH_OSD_FLAG_READ,
1212 ops,
1213 1,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001214 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001215 rbd_simple_req_cb, 0, NULL);
1216
1217 rbd_destroy_ops(ops);
1218 return ret;
1219}
1220
1221static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1222{
Alex Elder0ce1a792012-07-03 16:01:18 -05001223 struct rbd_device *rbd_dev = (struct rbd_device *)data;
Sage Weil13143d22011-05-12 16:08:30 -07001224 int rc;
1225
Alex Elder0ce1a792012-07-03 16:01:18 -05001226 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001227 return;
1228
Alex Elder0bed54d2012-07-03 16:01:18 -05001229 dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n",
1230 rbd_dev->header_name, notify_id, (int) opcode);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001231 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder0ce1a792012-07-03 16:01:18 -05001232 rc = __rbd_refresh_header(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001233 mutex_unlock(&ctl_mutex);
Sage Weil13143d22011-05-12 16:08:30 -07001234 if (rc)
Alex Elderf0f8cef2012-01-29 13:57:44 -06001235 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
Alex Elder0ce1a792012-07-03 16:01:18 -05001236 " update snaps: %d\n", rbd_dev->major, rc);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001237
Alex Elder0bed54d2012-07-03 16:01:18 -05001238 rbd_req_sync_notify_ack(rbd_dev, ver, notify_id, rbd_dev->header_name);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001239}
1240
1241/*
1242 * Request sync osd watch
1243 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001244static int rbd_req_sync_watch(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001245 const char *object_name,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001246 u64 ver)
1247{
1248 struct ceph_osd_req_op *ops;
Alex Elder0ce1a792012-07-03 16:01:18 -05001249 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001250
1251 int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
1252 if (ret < 0)
1253 return ret;
1254
1255 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
Alex Elder0ce1a792012-07-03 16:01:18 -05001256 (void *)rbd_dev, &rbd_dev->watch_event);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001257 if (ret < 0)
1258 goto fail;
1259
1260 ops[0].watch.ver = cpu_to_le64(ver);
Alex Elder0ce1a792012-07-03 16:01:18 -05001261 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001262 ops[0].watch.flag = 1;
1263
Alex Elder0ce1a792012-07-03 16:01:18 -05001264 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001265 CEPH_NOSNAP,
1266 0,
1267 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1268 ops,
Alex Elderaded07e2012-07-03 16:01:18 -05001269 1, object_name, 0, 0, NULL,
Alex Elder0ce1a792012-07-03 16:01:18 -05001270 &rbd_dev->watch_request, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001271
1272 if (ret < 0)
1273 goto fail_event;
1274
1275 rbd_destroy_ops(ops);
1276 return 0;
1277
1278fail_event:
Alex Elder0ce1a792012-07-03 16:01:18 -05001279 ceph_osdc_cancel_event(rbd_dev->watch_event);
1280 rbd_dev->watch_event = NULL;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001281fail:
1282 rbd_destroy_ops(ops);
1283 return ret;
1284}
1285
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001286/*
1287 * Request sync osd unwatch
1288 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001289static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001290 const char *object_name)
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001291{
1292 struct ceph_osd_req_op *ops;
1293
1294 int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
1295 if (ret < 0)
1296 return ret;
1297
1298 ops[0].watch.ver = 0;
Alex Elder0ce1a792012-07-03 16:01:18 -05001299 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001300 ops[0].watch.flag = 0;
1301
Alex Elder0ce1a792012-07-03 16:01:18 -05001302 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001303 CEPH_NOSNAP,
1304 0,
1305 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1306 ops,
Alex Elderaded07e2012-07-03 16:01:18 -05001307 1, object_name, 0, 0, NULL, NULL, NULL);
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001308
1309 rbd_destroy_ops(ops);
Alex Elder0ce1a792012-07-03 16:01:18 -05001310 ceph_osdc_cancel_event(rbd_dev->watch_event);
1311 rbd_dev->watch_event = NULL;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001312 return ret;
1313}
1314
/* Context handed to the osd event created for a notify request. */
struct rbd_notify_info {
	struct rbd_device *rbd_dev;	/* device the notify was issued for */
};
1318
1319static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1320{
Alex Elder0ce1a792012-07-03 16:01:18 -05001321 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1322 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001323 return;
1324
Alex Elder0ce1a792012-07-03 16:01:18 -05001325 dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n",
Alex Elder0bed54d2012-07-03 16:01:18 -05001326 rbd_dev->header_name,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001327 notify_id, (int)opcode);
1328}
1329
1330/*
1331 * Request sync osd notify
1332 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001333static int rbd_req_sync_notify(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001334 const char *object_name)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001335{
1336 struct ceph_osd_req_op *ops;
Alex Elder0ce1a792012-07-03 16:01:18 -05001337 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001338 struct ceph_osd_event *event;
1339 struct rbd_notify_info info;
1340 int payload_len = sizeof(u32) + sizeof(u32);
1341 int ret;
1342
1343 ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
1344 if (ret < 0)
1345 return ret;
1346
Alex Elder0ce1a792012-07-03 16:01:18 -05001347 info.rbd_dev = rbd_dev;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001348
1349 ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
1350 (void *)&info, &event);
1351 if (ret < 0)
1352 goto fail;
1353
1354 ops[0].watch.ver = 1;
1355 ops[0].watch.flag = 1;
1356 ops[0].watch.cookie = event->cookie;
1357 ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
1358 ops[0].watch.timeout = 12;
1359
Alex Elder0ce1a792012-07-03 16:01:18 -05001360 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001361 CEPH_NOSNAP,
1362 0,
1363 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1364 ops,
Alex Elderaded07e2012-07-03 16:01:18 -05001365 1, object_name, 0, 0, NULL, NULL, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001366 if (ret < 0)
1367 goto fail_event;
1368
1369 ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
1370 dout("ceph_osdc_wait_event returned %d\n", ret);
1371 rbd_destroy_ops(ops);
1372 return 0;
1373
1374fail_event:
1375 ceph_osdc_cancel_event(event);
1376fail:
1377 rbd_destroy_ops(ops);
1378 return ret;
1379}
1380
1381/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001382 * Request sync osd read
1383 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001384static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001385 const char *object_name,
1386 const char *class_name,
1387 const char *method_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001388 const char *data,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001389 int len,
1390 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001391{
1392 struct ceph_osd_req_op *ops;
Alex Elderaded07e2012-07-03 16:01:18 -05001393 int class_name_len = strlen(class_name);
1394 int method_name_len = strlen(method_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001395 int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
Alex Elderaded07e2012-07-03 16:01:18 -05001396 class_name_len + method_name_len + len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001397 if (ret < 0)
1398 return ret;
1399
Alex Elderaded07e2012-07-03 16:01:18 -05001400 ops[0].cls.class_name = class_name;
1401 ops[0].cls.class_len = (__u8) class_name_len;
1402 ops[0].cls.method_name = method_name;
1403 ops[0].cls.method_len = (__u8) method_name_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001404 ops[0].cls.argc = 0;
1405 ops[0].cls.indata = data;
1406 ops[0].cls.indata_len = len;
1407
Alex Elder0ce1a792012-07-03 16:01:18 -05001408 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001409 CEPH_NOSNAP,
1410 0,
1411 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1412 ops,
Alex Elderaded07e2012-07-03 16:01:18 -05001413 1, object_name, 0, 0, NULL, NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001414
1415 rbd_destroy_ops(ops);
1416
1417 dout("cls_exec returned %d\n", ret);
1418 return ret;
1419}
1420
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001421static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1422{
1423 struct rbd_req_coll *coll =
1424 kzalloc(sizeof(struct rbd_req_coll) +
1425 sizeof(struct rbd_req_status) * num_reqs,
1426 GFP_ATOMIC);
1427
1428 if (!coll)
1429 return NULL;
1430 coll->total = num_reqs;
1431 kref_init(&coll->kref);
1432 return coll;
1433}
1434
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001435/*
1436 * block device queue callback
1437 */
1438static void rbd_rq_fn(struct request_queue *q)
1439{
1440 struct rbd_device *rbd_dev = q->queuedata;
1441 struct request *rq;
1442 struct bio_pair *bp = NULL;
1443
Alex Elder00f1f362012-02-07 12:03:36 -06001444 while ((rq = blk_fetch_request(q))) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001445 struct bio *bio;
1446 struct bio *rq_bio, *next_bio = NULL;
1447 bool do_write;
1448 int size, op_size = 0;
1449 u64 ofs;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001450 int num_segs, cur_seg = 0;
1451 struct rbd_req_coll *coll;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001452
1453 /* peek at request from block layer */
1454 if (!rq)
1455 break;
1456
1457 dout("fetched request\n");
1458
1459 /* filter out block requests we don't understand */
1460 if ((rq->cmd_type != REQ_TYPE_FS)) {
1461 __blk_end_request_all(rq, 0);
Alex Elder00f1f362012-02-07 12:03:36 -06001462 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001463 }
1464
1465 /* deduce our operation (read, write) */
1466 do_write = (rq_data_dir(rq) == WRITE);
1467
1468 size = blk_rq_bytes(rq);
Alex Elder593a9e72012-02-07 12:03:37 -06001469 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001470 rq_bio = rq->bio;
1471 if (do_write && rbd_dev->read_only) {
1472 __blk_end_request_all(rq, -EROFS);
Alex Elder00f1f362012-02-07 12:03:36 -06001473 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001474 }
1475
1476 spin_unlock_irq(q->queue_lock);
1477
1478 dout("%s 0x%x bytes at 0x%llx\n",
1479 do_write ? "write" : "read",
Alex Elder593a9e72012-02-07 12:03:37 -06001480 size, blk_rq_pos(rq) * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001481
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001482 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
1483 coll = rbd_alloc_coll(num_segs);
1484 if (!coll) {
1485 spin_lock_irq(q->queue_lock);
1486 __blk_end_request_all(rq, -ENOMEM);
Alex Elder00f1f362012-02-07 12:03:36 -06001487 continue;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001488 }
1489
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001490 do {
1491 /* a bio clone to be passed down to OSD req */
1492 dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
1493 op_size = rbd_get_segment(&rbd_dev->header,
Alex Elderca1e49a2012-07-10 20:30:09 -05001494 rbd_dev->header.object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001495 ofs, size,
1496 NULL, NULL);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001497 kref_get(&coll->kref);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001498 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1499 op_size, GFP_ATOMIC);
1500 if (!bio) {
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001501 rbd_coll_end_req_index(rq, coll, cur_seg,
1502 -ENOMEM, op_size);
1503 goto next_seg;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001504 }
1505
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001506
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001507 /* init OSD command: write or read */
1508 if (do_write)
1509 rbd_req_write(rq, rbd_dev,
1510 rbd_dev->header.snapc,
1511 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001512 op_size, bio,
1513 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001514 else
1515 rbd_req_read(rq, rbd_dev,
Josh Durgin77dfe992011-11-21 13:04:42 -08001516 rbd_dev->snap_id,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001517 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001518 op_size, bio,
1519 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001520
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001521next_seg:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001522 size -= op_size;
1523 ofs += op_size;
1524
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001525 cur_seg++;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001526 rq_bio = next_bio;
1527 } while (size > 0);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001528 kref_put(&coll->kref, rbd_coll_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001529
1530 if (bp)
1531 bio_pair_release(bp);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001532 spin_lock_irq(q->queue_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001533 }
1534}
1535
1536/*
1537 * a queue callback. Makes sure that we don't create a bio that spans across
1538 * multiple osd objects. One exception would be with a single page bios,
1539 * which we handle later at bio_chain_clone
1540 */
1541static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1542 struct bio_vec *bvec)
1543{
1544 struct rbd_device *rbd_dev = q->queuedata;
Alex Elder593a9e72012-02-07 12:03:37 -06001545 unsigned int chunk_sectors;
1546 sector_t sector;
1547 unsigned int bio_sectors;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001548 int max;
1549
Alex Elder593a9e72012-02-07 12:03:37 -06001550 chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1551 sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1552 bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1553
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001554 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
Alex Elder593a9e72012-02-07 12:03:37 -06001555 + bio_sectors)) << SECTOR_SHIFT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001556 if (max < 0)
1557 max = 0; /* bio_add cannot handle a negative return */
1558 if (max <= bvec->bv_len && bio_sectors == 0)
1559 return bvec->bv_len;
1560 return max;
1561}
1562
1563static void rbd_free_disk(struct rbd_device *rbd_dev)
1564{
1565 struct gendisk *disk = rbd_dev->disk;
1566
1567 if (!disk)
1568 return;
1569
1570 rbd_header_free(&rbd_dev->header);
1571
1572 if (disk->flags & GENHD_FL_UP)
1573 del_gendisk(disk);
1574 if (disk->queue)
1575 blk_cleanup_queue(disk->queue);
1576 put_disk(disk);
1577}
1578
1579/*
1580 * reload the ondisk the header
1581 */
1582static int rbd_read_header(struct rbd_device *rbd_dev,
1583 struct rbd_image_header *header)
1584{
1585 ssize_t rc;
1586 struct rbd_image_header_ondisk *dh;
Xi Wang50f7c4c2012-04-20 15:49:44 -05001587 u32 snap_count = 0;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001588 u64 ver;
Alex Elder00f1f362012-02-07 12:03:36 -06001589 size_t len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001590
Alex Elder00f1f362012-02-07 12:03:36 -06001591 /*
1592 * First reads the fixed-size header to determine the number
1593 * of snapshots, then re-reads it, along with all snapshot
1594 * records as well as their stored names.
1595 */
1596 len = sizeof (*dh);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001597 while (1) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001598 dh = kmalloc(len, GFP_KERNEL);
1599 if (!dh)
1600 return -ENOMEM;
1601
1602 rc = rbd_req_sync_read(rbd_dev,
1603 NULL, CEPH_NOSNAP,
Alex Elder0bed54d2012-07-03 16:01:18 -05001604 rbd_dev->header_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001605 0, len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001606 (char *)dh, &ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001607 if (rc < 0)
1608 goto out_dh;
1609
1610 rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
Josh Durgin81e759f2011-11-15 14:49:53 -08001611 if (rc < 0) {
Alex Elder00f1f362012-02-07 12:03:36 -06001612 if (rc == -ENXIO)
Josh Durgin81e759f2011-11-15 14:49:53 -08001613 pr_warning("unrecognized header format"
Alex Elder0bed54d2012-07-03 16:01:18 -05001614 " for image %s\n",
1615 rbd_dev->image_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001616 goto out_dh;
Josh Durgin81e759f2011-11-15 14:49:53 -08001617 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001618
Alex Elder00f1f362012-02-07 12:03:36 -06001619 if (snap_count == header->total_snaps)
1620 break;
1621
1622 snap_count = header->total_snaps;
1623 len = sizeof (*dh) +
1624 snap_count * sizeof(struct rbd_image_snap_ondisk) +
1625 header->snap_names_len;
1626
1627 rbd_header_free(header);
1628 kfree(dh);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001629 }
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001630 header->obj_version = ver;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001631
1632out_dh:
1633 kfree(dh);
1634 return rc;
1635}
1636
1637/*
1638 * create a snapshot
1639 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001640static int rbd_header_add_snap(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001641 const char *snap_name,
1642 gfp_t gfp_flags)
1643{
1644 int name_len = strlen(snap_name);
1645 u64 new_snapid;
1646 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001647 void *data, *p, *e;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001648 u64 ver;
Alex Elder1dbb4392012-01-24 10:08:37 -06001649 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001650
1651 /* we should create a snapshot only if we're pointing at the head */
Alex Elder0ce1a792012-07-03 16:01:18 -05001652 if (rbd_dev->snap_id != CEPH_NOSNAP)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001653 return -EINVAL;
1654
Alex Elder0ce1a792012-07-03 16:01:18 -05001655 monc = &rbd_dev->rbd_client->client->monc;
1656 ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001657 dout("created snapid=%lld\n", new_snapid);
1658 if (ret < 0)
1659 return ret;
1660
1661 data = kmalloc(name_len + 16, gfp_flags);
1662 if (!data)
1663 return -ENOMEM;
1664
Sage Weil916d4d62011-05-12 16:10:50 -07001665 p = data;
1666 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001667
Sage Weil916d4d62011-05-12 16:10:50 -07001668 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1669 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001670
Alex Elder0bed54d2012-07-03 16:01:18 -05001671 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
Alex Elder0ce1a792012-07-03 16:01:18 -05001672 "rbd", "snap_add",
Sage Weil916d4d62011-05-12 16:10:50 -07001673 data, p - data, &ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001674
Sage Weil916d4d62011-05-12 16:10:50 -07001675 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001676
1677 if (ret < 0)
1678 return ret;
1679
Alex Elder0ce1a792012-07-03 16:01:18 -05001680 down_write(&rbd_dev->header_rwsem);
1681 rbd_dev->header.snapc->seq = new_snapid;
1682 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001683
1684 return 0;
1685bad:
1686 return -ERANGE;
1687}
1688
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001689static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1690{
1691 struct rbd_snap *snap;
1692
1693 while (!list_empty(&rbd_dev->snaps)) {
1694 snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node);
1695 __rbd_remove_snap_dev(rbd_dev, snap);
1696 }
1697}
1698
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001699/*
1700 * only read the first part of the ondisk header, without the snaps info
1701 */
Josh Durgin263c6ca2011-12-05 10:43:42 -08001702static int __rbd_refresh_header(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001703{
1704 int ret;
1705 struct rbd_image_header h;
1706 u64 snap_seq;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001707 int follow_seq = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001708
1709 ret = rbd_read_header(rbd_dev, &h);
1710 if (ret < 0)
1711 return ret;
1712
Sage Weil9db4b3e2011-04-19 22:49:06 -07001713 /* resized? */
Alex Elder593a9e72012-02-07 12:03:37 -06001714 set_capacity(rbd_dev->disk, h.image_size / SECTOR_SIZE);
Sage Weil9db4b3e2011-04-19 22:49:06 -07001715
Josh Durginc6666012011-11-21 17:11:12 -08001716 down_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001717
1718 snap_seq = rbd_dev->header.snapc->seq;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001719 if (rbd_dev->header.total_snaps &&
1720 rbd_dev->header.snapc->snaps[0] == snap_seq)
1721 /* pointing at the head, will need to follow that
1722 if head moves */
1723 follow_seq = 1;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001724
Alex Elder849b4262012-07-09 21:04:24 -05001725 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001726 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05001727 kfree(rbd_dev->header.snap_names);
1728 kfree(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001729
1730 rbd_dev->header.total_snaps = h.total_snaps;
1731 rbd_dev->header.snapc = h.snapc;
1732 rbd_dev->header.snap_names = h.snap_names;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001733 rbd_dev->header.snap_names_len = h.snap_names_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001734 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05001735 /* Free the extra copy of the object prefix */
1736 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1737 kfree(h.object_prefix);
1738
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001739 if (follow_seq)
1740 rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0];
1741 else
1742 rbd_dev->header.snapc->seq = snap_seq;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001743
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001744 ret = __rbd_init_snaps_header(rbd_dev);
1745
Josh Durginc6666012011-11-21 17:11:12 -08001746 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001747
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001748 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001749}
1750
1751static int rbd_init_disk(struct rbd_device *rbd_dev)
1752{
1753 struct gendisk *disk;
1754 struct request_queue *q;
1755 int rc;
Alex Elder593a9e72012-02-07 12:03:37 -06001756 u64 segment_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001757 u64 total_size = 0;
1758
1759 /* contact OSD, request size info about the object being mapped */
1760 rc = rbd_read_header(rbd_dev, &rbd_dev->header);
1761 if (rc)
1762 return rc;
1763
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001764 /* no need to lock here, as rbd_dev is not registered yet */
1765 rc = __rbd_init_snaps_header(rbd_dev);
1766 if (rc)
1767 return rc;
1768
Josh Durgincc9d7342011-11-21 18:19:13 -08001769 rc = rbd_header_set_snap(rbd_dev, &total_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001770 if (rc)
1771 return rc;
1772
1773 /* create gendisk info */
1774 rc = -ENOMEM;
1775 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1776 if (!disk)
1777 goto out;
1778
Alex Elderf0f8cef2012-01-29 13:57:44 -06001779 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Sage Weilaedfec52011-05-12 20:57:03 -07001780 rbd_dev->id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001781 disk->major = rbd_dev->major;
1782 disk->first_minor = 0;
1783 disk->fops = &rbd_bd_ops;
1784 disk->private_data = rbd_dev;
1785
1786 /* init rq */
1787 rc = -ENOMEM;
1788 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1789 if (!q)
1790 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07001791
Alex Elder593a9e72012-02-07 12:03:37 -06001792 /* We use the default size, but let's be explicit about it. */
1793 blk_queue_physical_block_size(q, SECTOR_SIZE);
1794
Josh Durgin029bcbd2011-07-22 11:35:23 -07001795 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06001796 segment_size = rbd_obj_bytes(&rbd_dev->header);
1797 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1798 blk_queue_max_segment_size(q, segment_size);
1799 blk_queue_io_min(q, segment_size);
1800 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07001801
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001802 blk_queue_merge_bvec(q, rbd_merge_bvec);
1803 disk->queue = q;
1804
1805 q->queuedata = rbd_dev;
1806
1807 rbd_dev->disk = disk;
1808 rbd_dev->q = q;
1809
1810 /* finally, announce the disk to the world */
Alex Elder593a9e72012-02-07 12:03:37 -06001811 set_capacity(disk, total_size / SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001812 add_disk(disk);
1813
1814 pr_info("%s: added with size 0x%llx\n",
1815 disk->disk_name, (unsigned long long)total_size);
1816 return 0;
1817
1818out_disk:
1819 put_disk(disk);
1820out:
1821 return rc;
1822}
1823
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001824/*
1825 sysfs
1826*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001827
Alex Elder593a9e72012-02-07 12:03:37 -06001828static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1829{
1830 return container_of(dev, struct rbd_device, dev);
1831}
1832
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001833static ssize_t rbd_size_show(struct device *dev,
1834 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001835{
Alex Elder593a9e72012-02-07 12:03:37 -06001836 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001837
1838 return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001839}
1840
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001841static ssize_t rbd_major_show(struct device *dev,
1842 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001843{
Alex Elder593a9e72012-02-07 12:03:37 -06001844 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001845
1846 return sprintf(buf, "%d\n", rbd_dev->major);
1847}
1848
1849static ssize_t rbd_client_id_show(struct device *dev,
1850 struct device_attribute *attr, char *buf)
1851{
Alex Elder593a9e72012-02-07 12:03:37 -06001852 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001853
Alex Elder1dbb4392012-01-24 10:08:37 -06001854 return sprintf(buf, "client%lld\n",
1855 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001856}
1857
1858static ssize_t rbd_pool_show(struct device *dev,
1859 struct device_attribute *attr, char *buf)
1860{
Alex Elder593a9e72012-02-07 12:03:37 -06001861 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001862
1863 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1864}
1865
Alex Elder9bb2f332012-07-12 10:46:35 -05001866static ssize_t rbd_pool_id_show(struct device *dev,
1867 struct device_attribute *attr, char *buf)
1868{
1869 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1870
1871 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1872}
1873
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001874static ssize_t rbd_name_show(struct device *dev,
1875 struct device_attribute *attr, char *buf)
1876{
Alex Elder593a9e72012-02-07 12:03:37 -06001877 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001878
Alex Elder0bed54d2012-07-03 16:01:18 -05001879 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001880}
1881
1882static ssize_t rbd_snap_show(struct device *dev,
1883 struct device_attribute *attr,
1884 char *buf)
1885{
Alex Elder593a9e72012-02-07 12:03:37 -06001886 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001887
1888 return sprintf(buf, "%s\n", rbd_dev->snap_name);
1889}
1890
1891static ssize_t rbd_image_refresh(struct device *dev,
1892 struct device_attribute *attr,
1893 const char *buf,
1894 size_t size)
1895{
Alex Elder593a9e72012-02-07 12:03:37 -06001896 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001897 int rc;
1898 int ret = size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001899
1900 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1901
Josh Durgin263c6ca2011-12-05 10:43:42 -08001902 rc = __rbd_refresh_header(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001903 if (rc < 0)
1904 ret = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001905
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001906 mutex_unlock(&ctl_mutex);
1907 return ret;
1908}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001909
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001910static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
1911static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
1912static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
1913static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
Alex Elder9bb2f332012-07-12 10:46:35 -05001914static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001915static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
1916static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
1917static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
1918static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001919
1920static struct attribute *rbd_attrs[] = {
1921 &dev_attr_size.attr,
1922 &dev_attr_major.attr,
1923 &dev_attr_client_id.attr,
1924 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05001925 &dev_attr_pool_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001926 &dev_attr_name.attr,
1927 &dev_attr_current_snap.attr,
1928 &dev_attr_refresh.attr,
1929 &dev_attr_create_snap.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001930 NULL
1931};
1932
1933static struct attribute_group rbd_attr_group = {
1934 .attrs = rbd_attrs,
1935};
1936
1937static const struct attribute_group *rbd_attr_groups[] = {
1938 &rbd_attr_group,
1939 NULL
1940};
1941
/* Release callback for rbd_device_type; intentionally does nothing. */
static void rbd_sysfs_dev_release(struct device *dev)
{
}
1945
1946static struct device_type rbd_device_type = {
1947 .name = "rbd",
1948 .groups = rbd_attr_groups,
1949 .release = rbd_sysfs_dev_release,
1950};
1951
1952
1953/*
1954 sysfs - snapshots
1955*/
1956
1957static ssize_t rbd_snap_size_show(struct device *dev,
1958 struct device_attribute *attr,
1959 char *buf)
1960{
1961 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1962
Josh Durgin35915382011-12-05 18:25:13 -08001963 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001964}
1965
1966static ssize_t rbd_snap_id_show(struct device *dev,
1967 struct device_attribute *attr,
1968 char *buf)
1969{
1970 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1971
Josh Durgin35915382011-12-05 18:25:13 -08001972 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001973}
1974
1975static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
1976static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
1977
1978static struct attribute *rbd_snap_attrs[] = {
1979 &dev_attr_snap_size.attr,
1980 &dev_attr_snap_id.attr,
1981 NULL,
1982};
1983
1984static struct attribute_group rbd_snap_attr_group = {
1985 .attrs = rbd_snap_attrs,
1986};
1987
1988static void rbd_snap_dev_release(struct device *dev)
1989{
1990 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1991 kfree(snap->name);
1992 kfree(snap);
1993}
1994
1995static const struct attribute_group *rbd_snap_attr_groups[] = {
1996 &rbd_snap_attr_group,
1997 NULL
1998};
1999
2000static struct device_type rbd_snap_device_type = {
2001 .groups = rbd_snap_attr_groups,
2002 .release = rbd_snap_dev_release,
2003};
2004
2005static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
2006 struct rbd_snap *snap)
2007{
2008 list_del(&snap->node);
2009 device_unregister(&snap->dev);
2010}
2011
2012static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
2013 struct rbd_snap *snap,
2014 struct device *parent)
2015{
2016 struct device *dev = &snap->dev;
2017 int ret;
2018
2019 dev->type = &rbd_snap_device_type;
2020 dev->parent = parent;
2021 dev->release = rbd_snap_dev_release;
2022 dev_set_name(dev, "snap_%s", snap->name);
2023 ret = device_register(dev);
2024
2025 return ret;
2026}
2027
2028static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
2029 int i, const char *name,
2030 struct rbd_snap **snapp)
2031{
2032 int ret;
2033 struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
2034 if (!snap)
2035 return -ENOMEM;
2036 snap->name = kstrdup(name, GFP_KERNEL);
2037 snap->size = rbd_dev->header.snap_sizes[i];
2038 snap->id = rbd_dev->header.snapc->snaps[i];
2039 if (device_is_registered(&rbd_dev->dev)) {
2040 ret = rbd_register_snap_dev(rbd_dev, snap,
2041 &rbd_dev->dev);
2042 if (ret < 0)
2043 goto err;
2044 }
2045 *snapp = snap;
2046 return 0;
2047err:
2048 kfree(snap->name);
2049 kfree(snap);
2050 return ret;
2051}
2052
/*
 * Step backward through a list of '\0'-terminated names stored back
 * to back.  Given a pointer just past the terminator of some name
 * (i.e. the start of the following name, or the end of the list),
 * return the start of that preceding name.
 *
 * Returns NULL when name is less than two bytes beyond start.
 */
const char *rbd_prev_snap_name(const char *name, const char *start)
{
	const char *cur;

	if (name < start + 2)
		return NULL;

	/* name[-1] is the previous terminator; begin scanning before it */
	for (cur = name - 2; *cur != '\0'; cur--) {
		if (cur == start)
			return start;
	}

	/* cur sits on the terminator of the name before the one we want */
	return cur + 1;
}
2069
2070/*
 * Compare the old list of snapshots that we have to what's in the
 * header and update it accordingly.  Note that the header holds the
 * snapshots in reverse order (from newest to oldest) and we need to go
 * from older to newer so that we don't get a duplicate snap name while
 * doing the process (e.g., a snapshot was removed and a new one with
 * the same name was created).
 *
 * The existing rbd_dev->snaps list is walked from its tail toward its
 * head while i counts down from header.total_snaps, so each old
 * snapshot is compared against header entry snaps[i - 1]:
 *  - no header entries left, or the old id is lower than the header
 *    id: the old snapshot device is removed;
 *  - the ids are equal: the snapshot is kept and both walks advance;
 *  - otherwise header entries are added as new snapshot devices until
 *    one with an id >= the old snapshot's id is reached.
 * Header entries still unvisited after the walk are added at the head
 * of the list.
 *
 * Returns 0 on success, -EINVAL if the name list runs out before the
 * header entries do, or the error from __rbd_add_snap_dev().  On an
 * error return, changes already made to the list are not undone.
 */
2078static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
2079{
2080 const char *name, *first_name;
2081 int i = rbd_dev->header.total_snaps;
2082 struct rbd_snap *snap, *old_snap = NULL;
2083 int ret;
2084 struct list_head *p, *n;
2085
2086 first_name = rbd_dev->header.snap_names;
/* start just past the end of the name list; it is walked backward */
2087 name = first_name + rbd_dev->header.snap_names_len;
2088
2089 list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
2090 u64 cur_id;
2091
2092 old_snap = list_entry(p, struct rbd_snap, node);
2093
/* cur_id stays unset when i == 0; the !i test below short-circuits */
2094 if (i)
2095 cur_id = rbd_dev->header.snapc->snaps[i - 1];
2096
2097 if (!i || old_snap->id < cur_id) {
2098 /* old_snap->id was skipped, thus was removed */
2099 __rbd_remove_snap_dev(rbd_dev, old_snap);
2100 continue;
2101 }
2102 if (old_snap->id == cur_id) {
2103 /* we have this snapshot already */
2104 i--;
2105 name = rbd_prev_snap_name(name, first_name);
2106 continue;
2107 }
/*
 * NOTE(review): this loop reads snaps[i] while the rest of the
 * function uses snaps[i - 1] for the same position; with
 * i == total_snaps that looks like a read past the last entry.
 * It also passes name to __rbd_add_snap_dev() before stepping it
 * back, unlike the loop after the list walk.  Confirm both.
 */
2108 for (; i > 0;
2109 i--, name = rbd_prev_snap_name(name, first_name)) {
2110 if (!name) {
2111 WARN_ON(1);
2112 return -EINVAL;
2113 }
2114 cur_id = rbd_dev->header.snapc->snaps[i];
2115 /* snapshot removal? handle it above */
2116 if (cur_id >= old_snap->id)
2117 break;
2118 /* a new snapshot */
2119 ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
2120 if (ret < 0)
2121 return ret;
2122
2123 /* note that we add it backward so using n and not p */
/* NOTE(review): list_add() puts the new node right after n, i.e. between n and old_snap -- confirm this is the intended position */
2124 list_add(&snap->node, n);
2125 p = &snap->node;
2126 }
2127 }
2128 /* we're done going over the old snap list, just add what's left */
2129 for (; i > 0; i--) {
2130 name = rbd_prev_snap_name(name, first_name);
2131 if (!name) {
2132 WARN_ON(1);
2133 return -EINVAL;
2134 }
2135 ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
2136 if (ret < 0)
2137 return ret;
/* entries are visited oldest first, so adding at the head leaves the newest first */
2138 list_add(&snap->node, &rbd_dev->snaps);
2139 }
2140
2141 return 0;
2142}
2143
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002144static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2145{
Alex Elderf0f8cef2012-01-29 13:57:44 -06002146 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002147 struct device *dev;
2148 struct rbd_snap *snap;
2149
2150 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2151 dev = &rbd_dev->dev;
2152
2153 dev->bus = &rbd_bus_type;
2154 dev->type = &rbd_device_type;
2155 dev->parent = &rbd_root_dev;
2156 dev->release = rbd_dev_release;
2157 dev_set_name(dev, "%d", rbd_dev->id);
2158 ret = device_register(dev);
2159 if (ret < 0)
Alex Elderf0f8cef2012-01-29 13:57:44 -06002160 goto out;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002161
2162 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2163 ret = rbd_register_snap_dev(rbd_dev, snap,
2164 &rbd_dev->dev);
2165 if (ret < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002166 break;
2167 }
Alex Elderf0f8cef2012-01-29 13:57:44 -06002168out:
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002169 mutex_unlock(&ctl_mutex);
2170 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002171}
2172
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002173static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2174{
2175 device_unregister(&rbd_dev->dev);
2176}
2177
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002178static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2179{
2180 int ret, rc;
2181
2182 do {
Alex Elder0bed54d2012-07-03 16:01:18 -05002183 ret = rbd_req_sync_watch(rbd_dev, rbd_dev->header_name,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002184 rbd_dev->header.obj_version);
2185 if (ret == -ERANGE) {
2186 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Josh Durgin263c6ca2011-12-05 10:43:42 -08002187 rc = __rbd_refresh_header(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002188 mutex_unlock(&ctl_mutex);
2189 if (rc < 0)
2190 return rc;
2191 }
2192 } while (ret == -ERANGE);
2193
2194 return ret;
2195}
2196
Alex Elder1ddbe942012-01-29 13:57:44 -06002197static atomic64_t rbd_id_max = ATOMIC64_INIT(0);
2198
2199/*
Alex Elder499afd52012-02-02 08:13:29 -06002200 * Get a unique rbd identifier for the given new rbd_dev, and add
2201 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06002202 */
Alex Elder499afd52012-02-02 08:13:29 -06002203static void rbd_id_get(struct rbd_device *rbd_dev)
Alex Elderb7f23c32012-01-29 13:57:43 -06002204{
Alex Elder499afd52012-02-02 08:13:29 -06002205 rbd_dev->id = atomic64_inc_return(&rbd_id_max);
2206
2207 spin_lock(&rbd_dev_list_lock);
2208 list_add_tail(&rbd_dev->node, &rbd_dev_list);
2209 spin_unlock(&rbd_dev_list_lock);
Alex Elder1ddbe942012-01-29 13:57:44 -06002210}
Alex Elderb7f23c32012-01-29 13:57:43 -06002211
Alex Elder1ddbe942012-01-29 13:57:44 -06002212/*
Alex Elder499afd52012-02-02 08:13:29 -06002213 * Remove an rbd_dev from the global list, and record that its
2214 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002215 */
Alex Elder499afd52012-02-02 08:13:29 -06002216static void rbd_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002217{
Alex Elderd184f6b2012-01-29 13:57:44 -06002218 struct list_head *tmp;
2219 int rbd_id = rbd_dev->id;
2220 int max_id;
2221
2222 BUG_ON(rbd_id < 1);
Alex Elder499afd52012-02-02 08:13:29 -06002223
2224 spin_lock(&rbd_dev_list_lock);
2225 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002226
2227 /*
2228 * If the id being "put" is not the current maximum, there
2229 * is nothing special we need to do.
2230 */
2231 if (rbd_id != atomic64_read(&rbd_id_max)) {
2232 spin_unlock(&rbd_dev_list_lock);
2233 return;
2234 }
2235
2236 /*
2237 * We need to update the current maximum id. Search the
2238 * list to find out what it is. We're more likely to find
2239 * the maximum at the end, so search the list backward.
2240 */
2241 max_id = 0;
2242 list_for_each_prev(tmp, &rbd_dev_list) {
2243 struct rbd_device *rbd_dev;
2244
2245 rbd_dev = list_entry(tmp, struct rbd_device, node);
2246 if (rbd_id > max_id)
2247 max_id = rbd_id;
2248 }
Alex Elder499afd52012-02-02 08:13:29 -06002249 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002250
Alex Elder1ddbe942012-01-29 13:57:44 -06002251 /*
Alex Elderd184f6b2012-01-29 13:57:44 -06002252 * The max id could have been updated by rbd_id_get(), in
2253 * which case it now accurately reflects the new maximum.
2254 * Be careful not to overwrite the maximum value in that
2255 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002256 */
Alex Elderd184f6b2012-01-29 13:57:44 -06002257 atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
Alex Elderb7f23c32012-01-29 13:57:43 -06002258}
2259
/*
 * Advances *buf past any leading white space so that it points at the
 * first non-space character (or at the terminating '\0' if there is
 * none), and returns the length of the token -- the run of non-white
 * space characters -- starting there.  *buf must be '\0'-terminated.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *token_start;

	token_start = *buf + strspn(*buf, whitespace);
	*buf = token_start;

	return strcspn(token_start, whitespace);
}
2278
/*
 * Locates the next token in *buf and, when it fits, copies it into the
 * caller's token buffer followed by a '\0'.  *buf must be
 * '\0'-terminated on entry.
 *
 * Returns the length of the token found, not counting the '\0'.  A
 * return of 0 means there was no token; a return >= token_size means
 * the token did not fit and nothing was copied.
 *
 * *buf is always advanced past the token that was found, whether or
 * not it was copied.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t token_len = next_token(buf);
	const char *src = *buf;

	*buf = src + token_len;
	if (token_len >= token_size)
		return token_len;

	memcpy(token, src, token_len);
	token[token_len] = '\0';

	return token_len;
}
2308
2309/*
Alex Elderea3352f2012-07-09 21:04:23 -05002310 * Finds the next token in *buf, dynamically allocates a buffer big
2311 * enough to hold a copy of it, and copies the token into the new
2312 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2313 * that a duplicate buffer is created even for a zero-length token.
2314 *
2315 * Returns a pointer to the newly-allocated duplicate, or a null
2316 * pointer if memory for the duplicate was not available. If
2317 * the lenp argument is a non-null pointer, the length of the token
2318 * (not including the '\0') is returned in *lenp.
2319 *
2320 * If successful, the *buf pointer will be updated to point beyond
2321 * the end of the found token.
2322 *
2323 * Note: uses GFP_KERNEL for allocation.
2324 */
2325static inline char *dup_token(const char **buf, size_t *lenp)
2326{
2327 char *dup;
2328 size_t len;
2329
2330 len = next_token(buf);
2331 dup = kmalloc(len + 1, GFP_KERNEL);
2332 if (!dup)
2333 return NULL;
2334
2335 memcpy(dup, *buf, len);
2336 *(dup + len) = '\0';
2337 *buf += len;
2338
2339 if (lenp)
2340 *lenp = len;
2341
2342 return dup;
2343}
2344
2345/*
Alex Elder0bed54d2012-07-03 16:01:18 -05002346 * This fills in the pool_name, image_name, image_name_len, snap_name,
Alex Eldera725f65e2012-02-02 08:13:30 -06002347 * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
2348 * on the list of monitor addresses and other options provided via
2349 * /sys/bus/rbd/add.
Alex Elderd22f76e2012-07-12 10:46:35 -05002350 *
2351 * Note: rbd_dev is assumed to have been initially zero-filled.
Alex Eldera725f65e2012-02-02 08:13:30 -06002352 */
2353static int rbd_add_parse_args(struct rbd_device *rbd_dev,
2354 const char *buf,
Alex Elder7ef32142012-02-02 08:13:30 -06002355 const char **mon_addrs,
Alex Elder5214ecc2012-02-02 08:13:30 -06002356 size_t *mon_addrs_size,
Alex Eldere28fff262012-02-02 08:13:30 -06002357 char *options,
Alex Elder0bed54d2012-07-03 16:01:18 -05002358 size_t options_size)
Alex Eldera725f65e2012-02-02 08:13:30 -06002359{
Alex Elderd22f76e2012-07-12 10:46:35 -05002360 size_t len;
2361 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06002362
2363 /* The first four tokens are required */
2364
Alex Elder7ef32142012-02-02 08:13:30 -06002365 len = next_token(&buf);
2366 if (!len)
Alex Eldera725f65e2012-02-02 08:13:30 -06002367 return -EINVAL;
Alex Elder5214ecc2012-02-02 08:13:30 -06002368 *mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06002369 *mon_addrs = buf;
2370
2371 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06002372
Alex Eldere28fff262012-02-02 08:13:30 -06002373 len = copy_token(&buf, options, options_size);
2374 if (!len || len >= options_size)
2375 return -EINVAL;
Alex Eldera725f65e2012-02-02 08:13:30 -06002376
Alex Elderbf3e5ae2012-07-09 21:04:23 -05002377 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05002378 rbd_dev->pool_name = dup_token(&buf, NULL);
2379 if (!rbd_dev->pool_name)
Alex Elderd22f76e2012-07-12 10:46:35 -05002380 goto out_err;
Alex Eldere28fff262012-02-02 08:13:30 -06002381
Alex Elder0bed54d2012-07-03 16:01:18 -05002382 rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
2383 if (!rbd_dev->image_name)
Alex Elderbf3e5ae2012-07-09 21:04:23 -05002384 goto out_err;
Alex Eldere28fff262012-02-02 08:13:30 -06002385
Alex Eldercb8627c2012-07-09 21:04:23 -05002386 /* Create the name of the header object */
2387
Alex Elder0bed54d2012-07-03 16:01:18 -05002388 rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
Alex Elderbf3e5ae2012-07-09 21:04:23 -05002389 + sizeof (RBD_SUFFIX),
2390 GFP_KERNEL);
Alex Elder0bed54d2012-07-03 16:01:18 -05002391 if (!rbd_dev->header_name)
Alex Eldercb8627c2012-07-09 21:04:23 -05002392 goto out_err;
Alex Elder0bed54d2012-07-03 16:01:18 -05002393 sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
Alex Eldera725f65e2012-02-02 08:13:30 -06002394
Alex Eldere28fff262012-02-02 08:13:30 -06002395 /*
Alex Elder820a5f32012-07-09 21:04:24 -05002396 * The snapshot name is optional. If none is is supplied,
2397 * we use the default value.
Alex Eldere28fff262012-02-02 08:13:30 -06002398 */
Alex Elder820a5f32012-07-09 21:04:24 -05002399 rbd_dev->snap_name = dup_token(&buf, &len);
2400 if (!rbd_dev->snap_name)
2401 goto out_err;
2402 if (!len) {
2403 /* Replace the empty name with the default */
2404 kfree(rbd_dev->snap_name);
2405 rbd_dev->snap_name
2406 = kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
2407 if (!rbd_dev->snap_name)
2408 goto out_err;
2409
Alex Eldere28fff262012-02-02 08:13:30 -06002410 memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
2411 sizeof (RBD_SNAP_HEAD_NAME));
Alex Elder849b4262012-07-09 21:04:24 -05002412 }
Alex Eldere28fff262012-02-02 08:13:30 -06002413
Alex Eldera725f65e2012-02-02 08:13:30 -06002414 return 0;
Alex Elderd22f76e2012-07-12 10:46:35 -05002415
2416out_err:
Alex Elder0bed54d2012-07-03 16:01:18 -05002417 kfree(rbd_dev->header_name);
2418 kfree(rbd_dev->image_name);
Alex Elderd22f76e2012-07-12 10:46:35 -05002419 kfree(rbd_dev->pool_name);
2420 rbd_dev->pool_name = NULL;
2421
2422 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06002423}
2424
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002425static ssize_t rbd_add(struct bus_type *bus,
2426 const char *buf,
2427 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002428{
Alex Eldercb8627c2012-07-09 21:04:23 -05002429 char *options;
2430 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06002431 const char *mon_addrs = NULL;
2432 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06002433 struct ceph_osd_client *osdc;
2434 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002435
2436 if (!try_module_get(THIS_MODULE))
2437 return -ENODEV;
2438
Alex Elder27cc2592012-02-02 08:13:30 -06002439 options = kmalloc(count, GFP_KERNEL);
2440 if (!options)
2441 goto err_nomem;
Alex Eldercb8627c2012-07-09 21:04:23 -05002442 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2443 if (!rbd_dev)
2444 goto err_nomem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002445
2446 /* static rbd_device initialization */
2447 spin_lock_init(&rbd_dev->lock);
2448 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002449 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08002450 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002451
Josh Durginc6666012011-11-21 17:11:12 -08002452 init_rwsem(&rbd_dev->header_rwsem);
Alex Elder0e805a12012-01-11 19:42:15 -08002453
Alex Elderd184f6b2012-01-29 13:57:44 -06002454 /* generate unique id: find highest unique id, add one */
Alex Elder499afd52012-02-02 08:13:29 -06002455 rbd_id_get(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002456
Alex Eldera725f65e2012-02-02 08:13:30 -06002457 /* Fill in the device name, now that we have its id. */
Alex Elder81a89792012-02-02 08:13:30 -06002458 BUILD_BUG_ON(DEV_NAME_LEN
2459 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
2460 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->id);
Alex Eldere124a82f2012-01-29 13:57:44 -06002461
Alex Eldera725f65e2012-02-02 08:13:30 -06002462 /* parse add command */
Alex Elder7ef32142012-02-02 08:13:30 -06002463 rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
Alex Eldere28fff262012-02-02 08:13:30 -06002464 options, count);
Alex Eldera725f65e2012-02-02 08:13:30 -06002465 if (rc)
2466 goto err_put_id;
2467
Alex Elder5214ecc2012-02-02 08:13:30 -06002468 rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1,
2469 options);
Alex Elderd720bcb2012-02-02 08:13:30 -06002470 if (IS_ERR(rbd_dev->rbd_client)) {
2471 rc = PTR_ERR(rbd_dev->rbd_client);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002472 goto err_put_id;
Alex Elderd720bcb2012-02-02 08:13:30 -06002473 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002474
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002475 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06002476 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002477 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2478 if (rc < 0)
2479 goto err_out_client;
Alex Elder9bb2f332012-07-12 10:46:35 -05002480 rbd_dev->pool_id = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002481
2482 /* register our block device */
Alex Elder27cc2592012-02-02 08:13:30 -06002483 rc = register_blkdev(0, rbd_dev->name);
2484 if (rc < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002485 goto err_out_client;
Alex Elder27cc2592012-02-02 08:13:30 -06002486 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002487
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002488 rc = rbd_bus_add_dev(rbd_dev);
2489 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002490 goto err_out_blkdev;
2491
Alex Elder32eec682012-02-08 16:11:14 -06002492 /*
2493 * At this point cleanup in the event of an error is the job
2494 * of the sysfs code (initiated by rbd_bus_del_dev()).
2495 *
2496 * Set up and announce blkdev mapping.
2497 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002498 rc = rbd_init_disk(rbd_dev);
2499 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002500 goto err_out_bus;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002501
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002502 rc = rbd_init_watch_dev(rbd_dev);
2503 if (rc)
2504 goto err_out_bus;
2505
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002506 return count;
2507
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002508err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002509 /* this will also clean up rest of rbd_dev stuff */
2510
2511 rbd_bus_del_dev(rbd_dev);
2512 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002513 return rc;
2514
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002515err_out_blkdev:
2516 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2517err_out_client:
2518 rbd_put_client(rbd_dev);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002519err_put_id:
Alex Eldercb8627c2012-07-09 21:04:23 -05002520 if (rbd_dev->pool_name) {
Alex Elder820a5f32012-07-09 21:04:24 -05002521 kfree(rbd_dev->snap_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002522 kfree(rbd_dev->header_name);
2523 kfree(rbd_dev->image_name);
Alex Eldercb8627c2012-07-09 21:04:23 -05002524 kfree(rbd_dev->pool_name);
2525 }
Alex Elder499afd52012-02-02 08:13:29 -06002526 rbd_id_put(rbd_dev);
Alex Elder27cc2592012-02-02 08:13:30 -06002527err_nomem:
Alex Elder27cc2592012-02-02 08:13:30 -06002528 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05002529 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06002530
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002531 dout("Error adding device %s\n", buf);
2532 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06002533
2534 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002535}
2536
2537static struct rbd_device *__rbd_get_dev(unsigned long id)
2538{
2539 struct list_head *tmp;
2540 struct rbd_device *rbd_dev;
2541
Alex Eldere124a82f2012-01-29 13:57:44 -06002542 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002543 list_for_each(tmp, &rbd_dev_list) {
2544 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Eldere124a82f2012-01-29 13:57:44 -06002545 if (rbd_dev->id == id) {
2546 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002547 return rbd_dev;
Alex Eldere124a82f2012-01-29 13:57:44 -06002548 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002549 }
Alex Eldere124a82f2012-01-29 13:57:44 -06002550 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002551 return NULL;
2552}
2553
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002554static void rbd_dev_release(struct device *dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002555{
Alex Elder593a9e72012-02-07 12:03:37 -06002556 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002557
Alex Elder1dbb4392012-01-24 10:08:37 -06002558 if (rbd_dev->watch_request) {
2559 struct ceph_client *client = rbd_dev->rbd_client->client;
2560
2561 ceph_osdc_unregister_linger_request(&client->osdc,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002562 rbd_dev->watch_request);
Alex Elder1dbb4392012-01-24 10:08:37 -06002563 }
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002564 if (rbd_dev->watch_event)
Alex Elder0bed54d2012-07-03 16:01:18 -05002565 rbd_req_sync_unwatch(rbd_dev, rbd_dev->header_name);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002566
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002567 rbd_put_client(rbd_dev);
2568
2569 /* clean up and free blkdev */
2570 rbd_free_disk(rbd_dev);
2571 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder32eec682012-02-08 16:11:14 -06002572
2573 /* done with the id, and with the rbd_dev */
Alex Elder820a5f32012-07-09 21:04:24 -05002574 kfree(rbd_dev->snap_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002575 kfree(rbd_dev->header_name);
Alex Elderd22f76e2012-07-12 10:46:35 -05002576 kfree(rbd_dev->pool_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002577 kfree(rbd_dev->image_name);
Alex Elder32eec682012-02-08 16:11:14 -06002578 rbd_id_put(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002579 kfree(rbd_dev);
2580
2581 /* release module ref */
2582 module_put(THIS_MODULE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002583}
2584
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002585static ssize_t rbd_remove(struct bus_type *bus,
2586 const char *buf,
2587 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002588{
2589 struct rbd_device *rbd_dev = NULL;
2590 int target_id, rc;
2591 unsigned long ul;
2592 int ret = count;
2593
2594 rc = strict_strtoul(buf, 10, &ul);
2595 if (rc)
2596 return rc;
2597
2598 /* convert to int; abort if we lost anything in the conversion */
2599 target_id = (int) ul;
2600 if (target_id != ul)
2601 return -EINVAL;
2602
2603 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2604
2605 rbd_dev = __rbd_get_dev(target_id);
2606 if (!rbd_dev) {
2607 ret = -ENOENT;
2608 goto done;
2609 }
2610
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002611 __rbd_remove_all_snaps(rbd_dev);
2612 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002613
2614done:
2615 mutex_unlock(&ctl_mutex);
2616 return ret;
2617}
2618
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002619static ssize_t rbd_snap_add(struct device *dev,
2620 struct device_attribute *attr,
2621 const char *buf,
2622 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002623{
Alex Elder593a9e72012-02-07 12:03:37 -06002624 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002625 int ret;
2626 char *name = kmalloc(count + 1, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002627 if (!name)
2628 return -ENOMEM;
2629
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002630 snprintf(name, count, "%s", buf);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002631
2632 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2633
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002634 ret = rbd_header_add_snap(rbd_dev,
2635 name, GFP_KERNEL);
2636 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002637 goto err_unlock;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002638
Josh Durgin263c6ca2011-12-05 10:43:42 -08002639 ret = __rbd_refresh_header(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002640 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002641 goto err_unlock;
2642
2643 /* shouldn't hold ctl_mutex when notifying.. notify might
2644 trigger a watch callback that would need to get that mutex */
2645 mutex_unlock(&ctl_mutex);
2646
2647 /* make a best effort, don't error if failed */
Alex Elder0bed54d2012-07-03 16:01:18 -05002648 rbd_req_sync_notify(rbd_dev, rbd_dev->header_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002649
2650 ret = count;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002651 kfree(name);
2652 return ret;
2653
2654err_unlock:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002655 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002656 kfree(name);
2657 return ret;
2658}
2659
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002660/*
2661 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002662 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002663 */
2664static int rbd_sysfs_init(void)
2665{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002666 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002667
Alex Elderfed4c142012-02-07 12:03:36 -06002668 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06002669 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002670 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002671
Alex Elderfed4c142012-02-07 12:03:36 -06002672 ret = bus_register(&rbd_bus_type);
2673 if (ret < 0)
2674 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002675
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002676 return ret;
2677}
2678
2679static void rbd_sysfs_cleanup(void)
2680{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002681 bus_unregister(&rbd_bus_type);
Alex Elderfed4c142012-02-07 12:03:36 -06002682 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002683}
2684
2685int __init rbd_init(void)
2686{
2687 int rc;
2688
2689 rc = rbd_sysfs_init();
2690 if (rc)
2691 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002692 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002693 return 0;
2694}
2695
2696void __exit rbd_exit(void)
2697{
2698 rbd_sysfs_cleanup();
2699}
2700
2701module_init(rbd_init);
2702module_exit(rbd_exit);
2703
2704MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
2705MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
2706MODULE_DESCRIPTION("rados block device");
2707
2708/* following authorship retained from original osdblk.c */
2709MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
2710
2711MODULE_LICENSE("GPL");