blob: 7a87a8c3fa34220db43b0822ce9b2f3392f6087f [file] [log] [blame]
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

                 Documentation/ABI/testing/sysfs-bus-rbd

 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define SECTOR_SHIFT	9
#define SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* Driver name, as used for the block device and in log messages */
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

/* Limits on user-supplied snapshot-name and option strings */
#define RBD_MAX_SNAP_NAME_LEN	32
#define RBD_MAX_OPT_LEN		1024

/* Pseudo snapshot name that maps the live (head) image, not a snapshot */
#define RBD_SNAP_HEAD_NAME	"-"

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

/* Default value for the "notify_timeout" option (see rbdopt_tokens) */
#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
73
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	u64 image_size;		/* image size, bytes (le64 on disk) */
	char *object_prefix;	/* NUL-terminated copy of on-disk block_name */
	__u8 obj_order;		/* log2 of bytes per data object */
	__u8 crypt_type;	/* copied verbatim from on-disk options */
	__u8 comp_type;		/* copied verbatim from on-disk options */
	struct ceph_snap_context *snapc; /* snapshot ids; snaps[] parallel
					  * to snap_sizes below */
	size_t snap_names_len;	/* total bytes of packed snapshot names */
	u64 snap_seq;
	u32 total_snaps;	/* number of snapshots in this header */

	char *snap_names;	/* packed NUL-terminated names, one per snap */
	u64 *snap_sizes;	/* per-snapshot image size */

	u64 obj_version;	/* header version (see rbd_do_request()'s
				 * ver out-parameter) */
};
93
/* Options parsed from the user-supplied option string, one per client */
struct rbd_options {
	int	notify_timeout;	/* "notify_timeout=%d" option value */
};
97
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;	/* ceph cluster connection */
	struct rbd_options	*rbd_opts;	/* owned; freed on release */
	struct kref		kref;		/* released via rbd_client_release() */
	struct list_head	node;		/* entry in rbd_client_list */
};
107
/*
 * a request completion status
 */
struct rbd_req_status {
	int done;	/* nonzero once this sub-request has completed */
	int rc;		/* completion result code */
	u64 bytes;	/* bytes transferred */
};
116
117/*
118 * a collection of requests
119 */
120struct rbd_req_coll {
121 int total;
122 int num_done;
123 struct kref kref;
124 struct rbd_req_status status[0];
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700125};
126
/*
 * a single io request
 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;			/* length of this sub-request, bytes */
	int coll_index;			/* slot in coll->status[] */
	struct rbd_req_coll *coll;	/* parent collection (may be NULL) */
};
138
/* In-memory representation of one snapshot, exposed through sysfs */
struct rbd_snap {
	struct device dev;	/* sysfs device node */
	const char *name;	/* snapshot name */
	u64 size;		/* image size for this snapshot */
	struct list_head node;	/* entry in rbd_dev->snaps */
	u64 id;			/* snapshot id */
};
146
/*
 * a single device
 */
struct rbd_device {
	int id;			/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;	/* blkdev's gendisk and rq */
	struct request_queue *q;

	struct rbd_client *rbd_client;	/* possibly shared ceph client */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;	/* queue lock */

	struct rbd_image_header header;	/* current image metadata */
	char *image_name;	/* rbd image this device maps */
	size_t image_name_len;
	char *header_name;	/* name of the image's header object */
	char *pool_name;	/* rados pool the image lives in */
	int pool_id;

	/* watch on the header object, for update notifications */
	struct ceph_osd_event *watch_event;
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore header_rwsem;
	char *snap_name;	/* mapped snapshot, or RBD_SNAP_HEAD_NAME */
	u64 snap_id;		/* current snapshot id */
	int read_only;		/* set when a snapshot is mapped */

	struct list_head node;	/* entry in rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
};
187
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Forward declarations for sysfs plumbing defined later in the file */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
static void rbd_dev_release(struct device *dev);
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count);
static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
				  struct rbd_snap *snap);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);

/* /sys/bus/rbd/{add,remove}: write-only controls to map/unmap images */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/* The root device owns no resources, so its release is a no-op */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Parent device of all rbd devices registered on the bus */
static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};
229
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800230
/* Take a reference on the rbd device's embedded struct device */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}

/* Drop a reference taken with rbd_get_dev() */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}

static int __rbd_refresh_header(struct rbd_device *rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700242
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700243static int rbd_open(struct block_device *bdev, fmode_t mode)
244{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600245 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700246
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800247 rbd_get_dev(rbd_dev);
248
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700249 set_device_ro(bdev, rbd_dev->read_only);
250
251 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
252 return -EROFS;
253
254 return 0;
255}
256
/* Drop the device reference taken in rbd_open() on last close */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	rbd_put_dev(rbd_dev);

	return 0;
}
265
/* Block device operations: rbd only needs open/release */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
271
272/*
273 * Initialize an rbd client instance.
274 * We own *opt.
275 */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700276static struct rbd_client *rbd_client_create(struct ceph_options *opt,
277 struct rbd_options *rbd_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700278{
279 struct rbd_client *rbdc;
280 int ret = -ENOMEM;
281
282 dout("rbd_client_create\n");
283 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
284 if (!rbdc)
285 goto out_opt;
286
287 kref_init(&rbdc->kref);
288 INIT_LIST_HEAD(&rbdc->node);
289
Alex Elderbc534d82012-01-29 13:57:44 -0600290 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
291
Sage Weil6ab00d42011-08-09 09:41:59 -0700292 rbdc->client = ceph_create_client(opt, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700293 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600294 goto out_mutex;
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400295 opt = NULL; /* Now rbdc->client is responsible for opt */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700296
297 ret = ceph_open_session(rbdc->client);
298 if (ret < 0)
299 goto out_err;
300
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700301 rbdc->rbd_opts = rbd_opts;
302
Alex Elder432b8582012-01-29 13:57:44 -0600303 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700304 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600305 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700306
Alex Elderbc534d82012-01-29 13:57:44 -0600307 mutex_unlock(&ctl_mutex);
308
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700309 dout("rbd_client_create created %p\n", rbdc);
310 return rbdc;
311
312out_err:
313 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600314out_mutex:
315 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700316 kfree(rbdc);
317out_opt:
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400318 if (opt)
319 ceph_destroy_options(opt);
320 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700321}
322
/*
 * Find a ceph client with specific addr and configuration.
 *
 * Caller must hold rbd_client_list_lock (see rbd_get_client()).
 * Returns NULL if sharing is disabled (CEPH_OPT_NOSHARE) or no
 * matching client exists.
 */
static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
{
	struct rbd_client *client_node;

	if (opt->flags & CEPH_OPT_NOSHARE)
		return NULL;

	list_for_each_entry(client_node, &rbd_client_list, node)
		if (ceph_compare_options(opt, client_node->client) == 0)
			return client_node;
	return NULL;
}
338
/*
 * mount options
 *
 * Token kind is deduced from position: tokens below Opt_last_int take
 * an integer argument, tokens between Opt_last_int and Opt_last_string
 * take a string argument (see parse_rbd_opts_token()).
 */
enum {
	Opt_notify_timeout,
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
};

static match_table_t rbdopt_tokens = {
	{Opt_notify_timeout, "notify_timeout=%d"},
	/* int args above */
	/* string args above */
	{-1, NULL}
};
356
/*
 * Parse a single option token; callback for ceph_parse_options().
 * @private points at the struct rbd_options being filled in.
 * Returns 0 on success or a negative errno for unknown tokens or
 * malformed integer arguments.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbdopt = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbdopt_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* intval is only assigned (and only valid) for integer tokens */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_notify_timeout:
		rbdopt->notify_timeout = intval;
		break;
	default:
		/* every token matched above must be handled here */
		BUG_ON(token);
	}
	return 0;
}
391
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Returns the client with a reference held, or
 * an ERR_PTR() on failure.
 */
static struct rbd_client *rbd_get_client(const char *mon_addr,
					 size_t mon_addr_len,
					 char *options)
{
	struct rbd_client *rbdc;
	struct ceph_options *opt;
	struct rbd_options *rbd_opts;

	rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		return ERR_PTR(-ENOMEM);

	rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;

	opt = ceph_parse_options(options, mon_addr,
				mon_addr + mon_addr_len,
				parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(opt)) {
		kfree(rbd_opts);
		return ERR_CAST(opt);
	}

	spin_lock(&rbd_client_list_lock);
	rbdc = __rbd_client_find(opt);
	if (rbdc) {
		/* using an existing client */
		kref_get(&rbdc->kref);
		spin_unlock(&rbd_client_list_lock);

		/* the existing client keeps its own opt/rbd_opts copies */
		ceph_destroy_options(opt);
		kfree(rbd_opts);

		return rbdc;
	}
	spin_unlock(&rbd_client_list_lock);

	/* rbd_client_create() consumes opt; it takes rbd_opts on success */
	rbdc = rbd_client_create(opt, rbd_opts);

	if (IS_ERR(rbdc))
		kfree(rbd_opts);

	return rbdc;
}
439
/*
 * Destroy ceph client (kref release callback).
 *
 * Takes rbd_client_list_lock itself to unlink the client, so the
 * caller must NOT already hold it.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc->rbd_opts);
	kfree(rbdc);
}
458
/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_device *rbd_dev)
{
	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
	rbd_dev->rbd_client = NULL;	/* guard against use after put */
}
468
/*
 * Destroy requests collection (kref release callback).
 */
static void rbd_coll_release(struct kref *kref)
{
	struct rbd_req_coll *coll =
		container_of(kref, struct rbd_req_coll, kref);

	dout("rbd_coll_release %p\n", coll);
	kfree(coll);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700480
/*
 * Create a new header structure, translate header format from the on-disk
 * header.
 *
 * @allocated_snaps is how many snapshot entries the caller's @ondisk
 * buffer has room for; snapshot ids/sizes/names are copied only when
 * it matches the on-disk snap_count (presumably the caller re-reads
 * with a bigger buffer otherwise — confirm against the read path).
 *
 * Returns 0; -ENXIO if the header magic text is wrong; -EINVAL if the
 * snapshot count is implausibly large; -ENOMEM on allocation failure.
 * On failure nothing is left allocated.
 */
static int rbd_header_from_disk(struct rbd_image_header *header,
				 struct rbd_image_header_ondisk *ondisk,
				 u32 allocated_snaps,
				 gfp_t gfp_flags)
{
	u32 i, snap_count;

	if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT)))
		return -ENXIO;

	snap_count = le32_to_cpu(ondisk->snap_count);
	/* reject counts that would overflow the snapc allocation below */
	if (snap_count > (UINT_MAX - sizeof(struct ceph_snap_context))
			 / sizeof (*ondisk))
		return -EINVAL;
	header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
				snap_count * sizeof(u64),
				gfp_flags);
	if (!header->snapc)
		return -ENOMEM;

	header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
	if (snap_count) {
		header->snap_names = kmalloc(header->snap_names_len,
					     gfp_flags);
		if (!header->snap_names)
			goto err_snapc;
		header->snap_sizes = kmalloc(snap_count * sizeof(u64),
					     gfp_flags);
		if (!header->snap_sizes)
			goto err_names;
	} else {
		header->snap_names = NULL;
		header->snap_sizes = NULL;
	}

	/* NUL-terminated copy of the fixed-size on-disk block_name field */
	header->object_prefix = kmalloc(sizeof (ondisk->block_name) + 1,
					gfp_flags);
	if (!header->object_prefix)
		goto err_sizes;

	memcpy(header->object_prefix, ondisk->block_name,
	       sizeof(ondisk->block_name));
	header->object_prefix[sizeof (ondisk->block_name)] = '\0';

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->obj_order = ondisk->options.order;
	header->crypt_type = ondisk->options.crypt_type;
	header->comp_type = ondisk->options.comp_type;

	atomic_set(&header->snapc->nref, 1);
	header->snap_seq = le64_to_cpu(ondisk->snap_seq);
	header->snapc->num_snaps = snap_count;
	header->total_snaps = snap_count;

	if (snap_count && allocated_snaps == snap_count) {
		for (i = 0; i < snap_count; i++) {
			header->snapc->snaps[i] =
				le64_to_cpu(ondisk->snaps[i].id);
			header->snap_sizes[i] =
				le64_to_cpu(ondisk->snaps[i].image_size);
		}

		/* copy snapshot names (packed right after the snaps array) */
		memcpy(header->snap_names, &ondisk->snaps[i],
			header->snap_names_len);
	}

	return 0;

err_sizes:
	kfree(header->snap_sizes);
err_names:
	kfree(header->snap_names);
err_snapc:
	kfree(header->snapc);
	return -ENOMEM;
}
562
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700563static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
564 u64 *seq, u64 *size)
565{
566 int i;
567 char *p = header->snap_names;
568
Alex Elder00f1f362012-02-07 12:03:36 -0600569 for (i = 0; i < header->total_snaps; i++) {
570 if (!strcmp(snap_name, p)) {
571
572 /* Found it. Pass back its id and/or size */
573
574 if (seq)
575 *seq = header->snapc->snaps[i];
576 if (size)
577 *size = header->snap_sizes[i];
578 return i;
579 }
580 p += strlen(p) + 1; /* Skip ahead to the next name */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700581 }
Alex Elder00f1f362012-02-07 12:03:36 -0600582 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700583}
584
/*
 * Apply rbd_dev->snap_name to the mapping, under header_rwsem.
 *
 * The special head name ("-") maps the live image read/write; any
 * other name maps that snapshot read-only.  *size (if non-NULL) is
 * written only for the head case or after a successful snapshot
 * lookup.  Returns 0, or -ENOENT if the named snapshot is unknown.
 */
static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
{
	struct rbd_image_header *header = &rbd_dev->header;
	struct ceph_snap_context *snapc = header->snapc;
	int ret = -ENOENT;

	down_write(&rbd_dev->header_rwsem);

	if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		/* mapping the live image, not a snapshot */
		if (header->total_snaps)
			snapc->seq = header->snap_seq;
		else
			snapc->seq = 0;
		rbd_dev->snap_id = CEPH_NOSNAP;
		rbd_dev->read_only = 0;
		if (size)
			*size = header->image_size;
	} else {
		ret = snap_by_name(header, rbd_dev->snap_name,
				   &snapc->seq, size);
		if (ret < 0)
			goto done;
		/* snapshots are always mapped read-only */
		rbd_dev->snap_id = snapc->seq;
		rbd_dev->read_only = 1;
	}

	ret = 0;
done:
	up_write(&rbd_dev->header_rwsem);
	return ret;
}
617
/* Free all storage owned by an in-memory image header
 * (allocated in rbd_header_from_disk()) */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	kfree(header->snap_sizes);
	kfree(header->snap_names);
	kfree(header->snapc);
}
625
626/*
627 * get the actual striped segment name, offset and length
628 */
629static u64 rbd_get_segment(struct rbd_image_header *header,
Alex Elderca1e49a2012-07-10 20:30:09 -0500630 const char *object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700631 u64 ofs, u64 len,
632 char *seg_name, u64 *segofs)
633{
634 u64 seg = ofs >> header->obj_order;
635
636 if (seg_name)
637 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
Alex Elderca1e49a2012-07-10 20:30:09 -0500638 "%s.%012llx", object_prefix, seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700639
640 ofs = ofs & ((1 << header->obj_order) - 1);
641 len = min_t(u64, len, (1 << header->obj_order) - ofs);
642
643 if (segofs)
644 *segofs = ofs;
645
646 return len;
647}
648
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700649static int rbd_get_num_segments(struct rbd_image_header *header,
650 u64 ofs, u64 len)
651{
652 u64 start_seg = ofs >> header->obj_order;
653 u64 end_seg = (ofs + len - 1) >> header->obj_order;
654 return end_seg - start_seg + 1;
655}
656
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700657/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700658 * returns the size of an object in the image
659 */
660static u64 rbd_obj_bytes(struct rbd_image_header *header)
661{
662 return 1 << header->obj_order;
663}
664
665/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700666 * bio helpers
667 */
668
669static void bio_chain_put(struct bio *chain)
670{
671 struct bio *tmp;
672
673 while (chain) {
674 tmp = chain;
675 chain = chain->bi_next;
676 bio_put(tmp);
677 }
678}
679
/*
 * zeros a bio chain, starting at specific offset
 *
 * All bytes of the chain's data at or beyond byte offset @start_ofs
 * (counted from the start of the chain) are cleared.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* running byte offset within the chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* zero this segment's tail past start_ofs
				 * (remainder is 0 once we're fully past it) */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
706
/*
 * bio_chain_clone - clone a chain of bios up to a certain length.
 * might return a bio_pair that will need to be released.
 *
 * Clones bios from *old until @len bytes are covered.  On return *old
 * points at the first unconsumed original bio and *next at where the
 * remainder of the request continues (either the next original bio or
 * the second half of a split).  Returns the new chain, or NULL on
 * allocation/split failure (any partial clone is released).
 */
static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
				   struct bio_pair **bp,
				   int len, gfp_t gfpmask)
{
	struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
	int total = 0;

	/* release any split left over from the previous call */
	if (*bp) {
		bio_pair_release(*bp);
		*bp = NULL;
	}

	while (old_chain && (total < len)) {
		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
		if (!tmp)
			goto err_out;

		if (total + old_chain->bi_size > len) {
			struct bio_pair *bp;

			/*
			 * this split can only happen with a single paged bio,
			 * split_bio will BUG_ON if this is not the case
			 */
			dout("bio_chain_clone split! total=%d remaining=%d"
			     "bi_size=%d\n",
			     (int)total, (int)len-total,
			     (int)old_chain->bi_size);

			/* split the bio. We'll release it either in the next
			   call, or it will have to be released outside */
			bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
			if (!bp)
				goto err_out;

			__bio_clone(tmp, &bp->bio1);

			*next = &bp->bio2;
		} else {
			__bio_clone(tmp, old_chain);
			*next = old_chain->bi_next;
		}

		tmp->bi_bdev = NULL;
		/* only the first allocation may block */
		gfpmask &= ~__GFP_WAIT;
		tmp->bi_next = NULL;

		/* append the clone to the new chain */
		if (!new_chain) {
			new_chain = tail = tmp;
		} else {
			tail->bi_next = tmp;
			tail = tmp;
		}
		old_chain = old_chain->bi_next;

		total += tmp->bi_size;
	}

	BUG_ON(total < len);

	if (tail)
		tail->bi_next = NULL;

	*old = old_chain;

	return new_chain;

err_out:
	dout("bio_chain_clone with err\n");
	bio_chain_put(new_chain);
	return NULL;
}
783
784/*
785 * helpers for osd request op vectors.
786 */
787static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
788 int num_ops,
789 int opcode,
790 u32 payload_len)
791{
792 *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
793 GFP_NOIO);
794 if (!*ops)
795 return -ENOMEM;
796 (*ops)[0].op = opcode;
797 /*
798 * op extent offset and length will be set later on
799 * in calc_raw_layout()
800 */
801 (*ops)[0].payload_len = payload_len;
802 return 0;
803}
804
/* Free an op vector allocated by rbd_create_rw_ops() */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
809
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700810static void rbd_coll_end_req_index(struct request *rq,
811 struct rbd_req_coll *coll,
812 int index,
813 int ret, u64 len)
814{
815 struct request_queue *q;
816 int min, max, i;
817
818 dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n",
819 coll, index, ret, len);
820
821 if (!rq)
822 return;
823
824 if (!coll) {
825 blk_end_request(rq, ret, len);
826 return;
827 }
828
829 q = rq->q;
830
831 spin_lock_irq(q->queue_lock);
832 coll->status[index].done = 1;
833 coll->status[index].rc = ret;
834 coll->status[index].bytes = len;
835 max = min = coll->num_done;
836 while (max < coll->total && coll->status[max].done)
837 max++;
838
839 for (i = min; i<max; i++) {
840 __blk_end_request(rq, coll->status[i].rc,
841 coll->status[i].bytes);
842 coll->num_done++;
843 kref_put(&coll->kref, rbd_coll_release);
844 }
845 spin_unlock_irq(q->queue_lock);
846}
847
/* Complete the collection slot associated with sub-request @req */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
853
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700854/*
855 * Send ceph osd request
856 */
857static int rbd_do_request(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -0500858 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700859 struct ceph_snap_context *snapc,
860 u64 snapid,
861 const char *obj, u64 ofs, u64 len,
862 struct bio *bio,
863 struct page **pages,
864 int num_pages,
865 int flags,
866 struct ceph_osd_req_op *ops,
867 int num_reply,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700868 struct rbd_req_coll *coll,
869 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700870 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700871 struct ceph_msg *msg),
872 struct ceph_osd_request **linger_req,
873 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700874{
875 struct ceph_osd_request *req;
876 struct ceph_file_layout *layout;
877 int ret;
878 u64 bno;
879 struct timespec mtime = CURRENT_TIME;
880 struct rbd_request *req_data;
881 struct ceph_osd_request_head *reqhead;
Alex Elder1dbb4392012-01-24 10:08:37 -0600882 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700883
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700884 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700885 if (!req_data) {
886 if (coll)
887 rbd_coll_end_req_index(rq, coll, coll_index,
888 -ENOMEM, len);
889 return -ENOMEM;
890 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700891
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700892 if (coll) {
893 req_data->coll = coll;
894 req_data->coll_index = coll_index;
895 }
896
897 dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700898
Alex Elder0ce1a792012-07-03 16:01:18 -0500899 down_read(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700900
Alex Elder0ce1a792012-07-03 16:01:18 -0500901 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder1dbb4392012-01-24 10:08:37 -0600902 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
903 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -0700904 if (!req) {
Alex Elder0ce1a792012-07-03 16:01:18 -0500905 up_read(&rbd_dev->header_rwsem);
Sage Weil4ad12622011-05-03 09:23:36 -0700906 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700907 goto done_pages;
908 }
909
910 req->r_callback = rbd_cb;
911
912 req_data->rq = rq;
913 req_data->bio = bio;
914 req_data->pages = pages;
915 req_data->len = len;
916
917 req->r_priv = req_data;
918
919 reqhead = req->r_request->front.iov_base;
920 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
921
922 strncpy(req->r_oid, obj, sizeof(req->r_oid));
923 req->r_oid_len = strlen(req->r_oid);
924
925 layout = &req->r_file_layout;
926 memset(layout, 0, sizeof(*layout));
927 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
928 layout->fl_stripe_count = cpu_to_le32(1);
929 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
Alex Elder0ce1a792012-07-03 16:01:18 -0500930 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
Alex Elder1dbb4392012-01-24 10:08:37 -0600931 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
932 req, ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700933
934 ceph_osdc_build_request(req, ofs, &len,
935 ops,
936 snapc,
937 &mtime,
938 req->r_oid, req->r_oid_len);
Alex Elder0ce1a792012-07-03 16:01:18 -0500939 up_read(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700940
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700941 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600942 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700943 *linger_req = req;
944 }
945
Alex Elder1dbb4392012-01-24 10:08:37 -0600946 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700947 if (ret < 0)
948 goto done_err;
949
950 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600951 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700952 if (ver)
953 *ver = le64_to_cpu(req->r_reassert_version.version);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700954 dout("reassert_ver=%lld\n",
955 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700956 ceph_osdc_put_request(req);
957 }
958 return ret;
959
960done_err:
961 bio_chain_put(req_data->bio);
962 ceph_osdc_put_request(req);
963done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700964 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700965 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700966 return ret;
967}
968
/*
 * Ceph osd op callback
 *
 * Completion callback for an asynchronous object request issued by
 * rbd_do_request().  Parses the osd reply, fixes up sparse-read
 * results, completes the request's slot in its collection, and drops
 * the references taken when the request was submitted.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);	/* op array immediately follows the head */
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);

	/*
	 * rbd images are sparse: a read of a nonexistent object is not
	 * an error, it just means "all zeros".  Likewise a short read
	 * is zero-filled up to the requested length.
	 */
	if (rc == -ENOENT && read_op) {
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1007
/*
 * Minimal completion callback: just drop the osd request reference.
 * Used for fire-and-forget requests (e.g. notify acks) that carry no
 * per-request state to clean up.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1012
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a page vector covering [ofs, ofs+len), optionally builds a
 * single read/write op (when @orig_ops is NULL), submits the request
 * via rbd_do_request() with no callback — which makes it wait for
 * completion — and copies data between @buf and the page vector.
 *
 * Note: @num_reply is not used here; rbd_do_request() is always called
 * with a literal 2.  @opcode is ignored when @orig_ops is supplied.
 * Returns bytes handled (reads) or 0 on success, negative errno on
 * failure.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int opcode,
			   int flags,
			   struct ceph_osd_req_op *orig_ops,
			   int num_reply,
			   const char *obj,
			   u64 ofs, u64 len,
			   char *buf,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;
	struct ceph_osd_req_op *ops = orig_ops;	/* may be replaced below */
	u32 payload_len;

	num_pages = calc_pages_for(ofs , len);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* no caller-supplied ops: build a single read/write op ourselves */
	if (!orig_ops) {
		payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
		ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
		if (ret < 0)
			goto done;

		/* stage outgoing data into the page vector */
		if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
			ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
			if (ret < 0)
				goto done_ops;
		}
	}

	/* NULL callback => rbd_do_request() waits for completion */
	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			  obj, ofs, len, NULL,
			  pages, num_pages,
			  flags,
			  ops,
			  2,
			  NULL, 0,
			  NULL,
			  linger_req, ver);
	if (ret < 0)
		goto done_ops;

	/* copy read data back out; ret is the byte count from the osd */
	if ((flags & CEPH_OSD_FLAG_READ) && buf)
		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);

done_ops:
	/* only destroy ops we built ourselves, never the caller's */
	if (!orig_ops)
		rbd_destroy_ops(ops);
done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1075
/*
 * Do an asynchronous ceph osd operation
 *
 * Maps the image byte range [ofs, ofs+len) onto a single rbd object
 * segment, builds one read/write op for it and submits it with
 * rbd_req_cb() as completion callback.  Completion status is reported
 * through @coll slot @coll_index.
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 snapid,
		     int opcode, int flags, int num_reply,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;

	seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
	if (!seg_name)
		return -ENOMEM;

	/* translate image offset/length into object name + offset */
	seg_len = rbd_get_segment(&rbd_dev->header,
				  rbd_dev->header.object_prefix,
				  ofs, len,
				  seg_name, &seg_ofs);

	/* writes carry the data as payload; reads have none */
	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);

	ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
	if (ret < 0)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	BUG_ON(seg_len < len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     num_reply,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1131
1132/*
1133 * Request async osd write
1134 */
1135static int rbd_req_write(struct request *rq,
1136 struct rbd_device *rbd_dev,
1137 struct ceph_snap_context *snapc,
1138 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001139 struct bio *bio,
1140 struct rbd_req_coll *coll,
1141 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001142{
1143 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1144 CEPH_OSD_OP_WRITE,
1145 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1146 2,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001147 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001148}
1149
1150/*
1151 * Request async osd read
1152 */
1153static int rbd_req_read(struct request *rq,
1154 struct rbd_device *rbd_dev,
1155 u64 snapid,
1156 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001157 struct bio *bio,
1158 struct rbd_req_coll *coll,
1159 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001160{
1161 return rbd_do_op(rq, rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001162 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001163 CEPH_OSD_OP_READ,
1164 CEPH_OSD_FLAG_READ,
1165 2,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001166 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001167}
1168
1169/*
1170 * Request sync osd read
1171 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001172static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001173 struct ceph_snap_context *snapc,
1174 u64 snapid,
1175 const char *obj,
1176 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001177 char *buf,
1178 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001179{
Alex Elder0ce1a792012-07-03 16:01:18 -05001180 return rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001181 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001182 CEPH_OSD_OP_READ,
1183 CEPH_OSD_FLAG_READ,
1184 NULL,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001185 1, obj, ofs, len, buf, NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001186}
1187
/*
 * Request an async notify ack
 *
 * (The original header comment said "watch"; this actually sends a
 * CEPH_OSD_OP_NOTIFY_ACK in response to a received notification.)
 * Fire-and-forget: rbd_simple_req_cb() just drops the request on
 * completion.
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id,
				   const char *obj)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (ret < 0)
		return ret;

	/* ack with our current view of the header version */
	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  obj, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  ops,
			  1,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1219
/*
 * Watch event callback: the header object changed (e.g. a snapshot
 * was taken), so re-read the on-disk header under ctl_mutex and then
 * acknowledge the notification.  @data is the rbd_device registered
 * in rbd_req_sync_watch().
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	int rc;

	if (!rbd_dev)
		return;

	dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n",
		rbd_dev->header_name, notify_id, (int) opcode);
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	rc = __rbd_refresh_header(rbd_dev);
	mutex_unlock(&ctl_mutex);
	if (rc)
		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
			   " update snaps: %d\n", rbd_dev->major, rc);

	/* always ack, even if the refresh failed */
	rbd_req_sync_notify_ack(rbd_dev, ver, notify_id, rbd_dev->header_name);
}
1239
/*
 * Request sync osd watch
 *
 * Registers a watch on object @obj: creates an osd event that will
 * invoke rbd_watch_cb(), then issues a lingering CEPH_OSD_OP_WATCH so
 * the osd keeps notifying us.  On success the event and the lingering
 * request are stored in rbd_dev->watch_event / watch_request; both
 * are torn down in rbd_req_sync_unwatch().
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev,
			      const char *obj,
			      u64 ver)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;

	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
	if (ret < 0)
		return ret;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(ver);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 == establish the watch */

	/* linger_req keeps the watch alive across osd restarts */
	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      1, obj, 0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1284
/*
 * Request sync osd unwatch
 *
 * Tears down the watch established by rbd_req_sync_watch(): issues a
 * CEPH_OSD_OP_WATCH with flag 0 (unwatch) using the existing event
 * cookie, then cancels the event itself.
 */
static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev,
				const char *obj)
{
	struct ceph_osd_req_op *ops;

	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
	if (ret < 0)
		return ret;

	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 0;	/* 0 == remove the watch */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      1, obj, 0, 0, NULL, NULL, NULL);

	/* the event is cancelled even if the unwatch op failed */
	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	return ret;
}
1313
/*
 * Context registered with ceph_osdc_create_event() by
 * rbd_req_sync_notify() and delivered to rbd_notify_cb().  It lives
 * on rbd_req_sync_notify()'s stack for the duration of the
 * synchronous notify.
 */
struct rbd_notify_info {
	struct rbd_device *rbd_dev;
};
1317
1318static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1319{
Alex Elder0ce1a792012-07-03 16:01:18 -05001320 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1321 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001322 return;
1323
Alex Elder0ce1a792012-07-03 16:01:18 -05001324 dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n",
Alex Elder0bed54d2012-07-03 16:01:18 -05001325 rbd_dev->header_name,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001326 notify_id, (int)opcode);
1327}
1328
/*
 * Request sync osd notify
 *
 * Sends a CEPH_OSD_OP_NOTIFY on @obj and waits (up to the default osd
 * timeout) for the notification round to complete.  The on-stack
 * rbd_notify_info is handed to rbd_notify_cb() via the osd event.
 */
static int rbd_req_sync_notify(struct rbd_device *rbd_dev,
			       const char *obj)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_event *event;
	struct rbd_notify_info info;
	/* payload: version (u32) + timeout (u32) */
	int payload_len = sizeof(u32) + sizeof(u32);
	int ret;

	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
	if (ret < 0)
		return ret;

	info.rbd_dev = rbd_dev;

	/* one_shot event: fires once, for this notify round */
	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
				     (void *)&info, &event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = 1;
	ops[0].watch.flag = 1;
	ops[0].watch.cookie = event->cookie;
	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
	ops[0].watch.timeout = 12;	/* seconds, per notify protocol */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      1, obj, 0, 0, NULL, NULL, NULL);
	if (ret < 0)
		goto fail_event;

	/* block until the watchers have been notified (or timeout) */
	ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
	dout("ceph_osdc_wait_event returned %d\n", ret);
	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(event);
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1379
/*
 * Synchronously invoke an osd class method (CEPH_OSD_OP_CALL) on
 * object @obj, passing @len bytes of @data as input.
 * (The original header comment said "Request sync osd read", which
 * was a copy-paste error — this executes class @cls method @method.)
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *obj,
			     const char *cls,
			     const char *method,
			     const char *data,
			     int len,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int cls_len = strlen(cls);
	int method_len = strlen(method);
	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
				    cls_len + method_len + len);
	if (ret < 0)
		return ret;

	ops[0].cls.class_name = cls;
	ops[0].cls.class_len = (__u8)cls_len;
	ops[0].cls.method_name = method;
	ops[0].cls.method_len = (__u8)method_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = data;
	ops[0].cls.indata_len = len;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      1, obj, 0, 0, NULL, NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1419
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001420static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1421{
1422 struct rbd_req_coll *coll =
1423 kzalloc(sizeof(struct rbd_req_coll) +
1424 sizeof(struct rbd_req_status) * num_reqs,
1425 GFP_ATOMIC);
1426
1427 if (!coll)
1428 return NULL;
1429 coll->total = num_reqs;
1430 kref_init(&coll->kref);
1431 return coll;
1432}
1433
/*
 * block device queue callback
 *
 * Request function: entered with q->queue_lock held.  Pulls requests
 * off the queue, splits each one into per-object segments, clones the
 * bio chain per segment and submits an async osd read or write for
 * each.  Completion is tracked per-request via an rbd_req_coll.
 * The queue lock is dropped while building/submitting osd requests
 * and re-taken before fetching the next request.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;
	struct bio_pair *bp = NULL;

	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		struct bio *rq_bio, *next_bio = NULL;
		bool do_write;
		int size, op_size = 0;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;

		/* peek at request from block layer */
		/* (redundant: the while condition already guarantees rq) */
		if (!rq)
			break;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		rq_bio = rq->bio;
		if (do_write && rbd_dev->read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		/* drop the queue lock for the (sleeping) submission work */
		spin_unlock_irq(q->queue_lock);

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			/* __blk_end_request_all needs the lock back */
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			continue;
		}

		do {
			/* a bio clone to be passed down to OSD req */
			dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
			op_size = rbd_get_segment(&rbd_dev->header,
						  rbd_dev->header.object_prefix,
						  ofs, size,
						  NULL, NULL);
			/* one collection ref per in-flight segment */
			kref_get(&coll->kref);
			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
					      op_size, GFP_ATOMIC);
			if (!bio) {
				/* mark this segment failed but keep going */
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, op_size);
				goto next_seg;
			}


			/* init OSD command: write or read */
			if (do_write)
				rbd_req_write(rq, rbd_dev,
					      rbd_dev->header.snapc,
					      ofs,
					      op_size, bio,
					      coll, cur_seg);
			else
				rbd_req_read(rq, rbd_dev,
					     rbd_dev->snap_id,
					     ofs,
					     op_size, bio,
					     coll, cur_seg);

next_seg:
			size -= op_size;
			ofs += op_size;

			cur_seg++;
			rq_bio = next_bio;
		} while (size > 0);
		/* drop the submission ref taken by rbd_alloc_coll() */
		kref_put(&coll->kref, rbd_coll_release);

		if (bp)
			bio_pair_release(bp);
		/* re-take the lock for the next blk_fetch_request() */
		spin_lock_irq(q->queue_lock);
	}
}
1534
1535/*
1536 * a queue callback. Makes sure that we don't create a bio that spans across
1537 * multiple osd objects. One exception would be with a single page bios,
1538 * which we handle later at bio_chain_clone
1539 */
1540static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1541 struct bio_vec *bvec)
1542{
1543 struct rbd_device *rbd_dev = q->queuedata;
Alex Elder593a9e72012-02-07 12:03:37 -06001544 unsigned int chunk_sectors;
1545 sector_t sector;
1546 unsigned int bio_sectors;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001547 int max;
1548
Alex Elder593a9e72012-02-07 12:03:37 -06001549 chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1550 sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1551 bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1552
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001553 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
Alex Elder593a9e72012-02-07 12:03:37 -06001554 + bio_sectors)) << SECTOR_SHIFT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001555 if (max < 0)
1556 max = 0; /* bio_add cannot handle a negative return */
1557 if (max <= bvec->bv_len && bio_sectors == 0)
1558 return bvec->bv_len;
1559 return max;
1560}
1561
1562static void rbd_free_disk(struct rbd_device *rbd_dev)
1563{
1564 struct gendisk *disk = rbd_dev->disk;
1565
1566 if (!disk)
1567 return;
1568
1569 rbd_header_free(&rbd_dev->header);
1570
1571 if (disk->flags & GENHD_FL_UP)
1572 del_gendisk(disk);
1573 if (disk->queue)
1574 blk_cleanup_queue(disk->queue);
1575 put_disk(disk);
1576}
1577
/*
 * reload the ondisk the header
 *
 * Reads the image header object into @header.  Because the snapshot
 * count can change between reads, this loops: read with the current
 * size estimate, and if the snapshot count observed differs from the
 * estimate, recompute the needed length and try again.
 * Returns 0/positive on success, negative errno on failure.
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
			   struct rbd_image_header *header)
{
	ssize_t rc;
	struct rbd_image_header_ondisk *dh;
	u32 snap_count = 0;
	u64 ver;
	size_t len;

	/*
	 * First reads the fixed-size header to determine the number
	 * of snapshots, then re-reads it, along with all snapshot
	 * records as well as their stored names.
	 */
	len = sizeof (*dh);
	while (1) {
		dh = kmalloc(len, GFP_KERNEL);
		if (!dh)
			return -ENOMEM;

		rc = rbd_req_sync_read(rbd_dev,
				       NULL, CEPH_NOSNAP,
				       rbd_dev->header_name,
				       0, len,
				       (char *)dh, &ver);
		if (rc < 0)
			goto out_dh;

		rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
		if (rc < 0) {
			if (rc == -ENXIO)
				pr_warning("unrecognized header format"
					   " for image %s\n",
					   rbd_dev->image_name);
			goto out_dh;
		}

		/* snapshot count stable across the two reads: done */
		if (snap_count == header->total_snaps)
			break;

		/* grow the read to cover all snap records + names, retry */
		snap_count = header->total_snaps;
		len = sizeof (*dh) +
			snap_count * sizeof(struct rbd_image_snap_ondisk) +
			header->snap_names_len;

		rbd_header_free(header);
		kfree(dh);
	}
	header->obj_version = ver;

out_dh:
	kfree(dh);
	return rc;
}
1635
1636/*
1637 * create a snapshot
1638 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001639static int rbd_header_add_snap(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001640 const char *snap_name,
1641 gfp_t gfp_flags)
1642{
1643 int name_len = strlen(snap_name);
1644 u64 new_snapid;
1645 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001646 void *data, *p, *e;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001647 u64 ver;
Alex Elder1dbb4392012-01-24 10:08:37 -06001648 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001649
1650 /* we should create a snapshot only if we're pointing at the head */
Alex Elder0ce1a792012-07-03 16:01:18 -05001651 if (rbd_dev->snap_id != CEPH_NOSNAP)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001652 return -EINVAL;
1653
Alex Elder0ce1a792012-07-03 16:01:18 -05001654 monc = &rbd_dev->rbd_client->client->monc;
1655 ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001656 dout("created snapid=%lld\n", new_snapid);
1657 if (ret < 0)
1658 return ret;
1659
1660 data = kmalloc(name_len + 16, gfp_flags);
1661 if (!data)
1662 return -ENOMEM;
1663
Sage Weil916d4d62011-05-12 16:10:50 -07001664 p = data;
1665 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001666
Sage Weil916d4d62011-05-12 16:10:50 -07001667 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1668 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001669
Alex Elder0bed54d2012-07-03 16:01:18 -05001670 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
Alex Elder0ce1a792012-07-03 16:01:18 -05001671 "rbd", "snap_add",
Sage Weil916d4d62011-05-12 16:10:50 -07001672 data, p - data, &ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001673
Sage Weil916d4d62011-05-12 16:10:50 -07001674 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001675
1676 if (ret < 0)
1677 return ret;
1678
Alex Elder0ce1a792012-07-03 16:01:18 -05001679 down_write(&rbd_dev->header_rwsem);
1680 rbd_dev->header.snapc->seq = new_snapid;
1681 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001682
1683 return 0;
1684bad:
1685 return -ERANGE;
1686}
1687
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001688static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1689{
1690 struct rbd_snap *snap;
1691
1692 while (!list_empty(&rbd_dev->snaps)) {
1693 snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node);
1694 __rbd_remove_snap_dev(rbd_dev, snap);
1695 }
1696}
1697
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001698/*
1699 * only read the first part of the ondisk header, without the snaps info
1700 */
Josh Durgin263c6ca2011-12-05 10:43:42 -08001701static int __rbd_refresh_header(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001702{
1703 int ret;
1704 struct rbd_image_header h;
1705 u64 snap_seq;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001706 int follow_seq = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001707
1708 ret = rbd_read_header(rbd_dev, &h);
1709 if (ret < 0)
1710 return ret;
1711
Sage Weil9db4b3e2011-04-19 22:49:06 -07001712 /* resized? */
Alex Elder593a9e72012-02-07 12:03:37 -06001713 set_capacity(rbd_dev->disk, h.image_size / SECTOR_SIZE);
Sage Weil9db4b3e2011-04-19 22:49:06 -07001714
Josh Durginc6666012011-11-21 17:11:12 -08001715 down_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001716
1717 snap_seq = rbd_dev->header.snapc->seq;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001718 if (rbd_dev->header.total_snaps &&
1719 rbd_dev->header.snapc->snaps[0] == snap_seq)
1720 /* pointing at the head, will need to follow that
1721 if head moves */
1722 follow_seq = 1;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001723
Alex Elder849b4262012-07-09 21:04:24 -05001724 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001725 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05001726 kfree(rbd_dev->header.snap_names);
1727 kfree(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001728
1729 rbd_dev->header.total_snaps = h.total_snaps;
1730 rbd_dev->header.snapc = h.snapc;
1731 rbd_dev->header.snap_names = h.snap_names;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001732 rbd_dev->header.snap_names_len = h.snap_names_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001733 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05001734 /* Free the extra copy of the object prefix */
1735 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1736 kfree(h.object_prefix);
1737
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001738 if (follow_seq)
1739 rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0];
1740 else
1741 rbd_dev->header.snapc->seq = snap_seq;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001742
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001743 ret = __rbd_init_snaps_header(rbd_dev);
1744
Josh Durginc6666012011-11-21 17:11:12 -08001745 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001746
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001747 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001748}
1749
/*
 * Set up and register the gendisk and request queue for a mapped
 * rbd device, then announce the disk to the block layer.
 *
 * Fetches the image header from the OSDs, builds the initial in-core
 * snapshot list, and selects the snapshot to map (which determines
 * the capacity used below).
 *
 * Returns 0 on success, negative errno on failure.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	int rc;
	u64 segment_size;
	u64 total_size = 0;

	/* contact OSD, request size info about the object being mapped */
	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (rc)
		return rc;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = __rbd_init_snaps_header(rbd_dev);
	if (rc)
		return rc;

	/* select the mapped snapshot; *total_size receives its size */
	rc = rbd_header_set_snap(rbd_dev, &total_size);
	if (rc)
		return rc;

	/* create gendisk info */
	rc = -ENOMEM;
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		goto out;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	rc = -ENOMEM;
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;
	rbd_dev->q = q;

	/* finally, announce the disk to the world */
	set_capacity(disk, total_size / SECTOR_SIZE);
	add_disk(disk);

	pr_info("%s: added with size 0x%llx\n",
		disk->disk_name, (unsigned long long)total_size);
	return 0;

out_disk:
	put_disk(disk);
out:
	return rc;
}
1822
/*
 * sysfs interface
 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001826
/* Map a struct device embedded in an rbd_device back to its container */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1831
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001832static ssize_t rbd_size_show(struct device *dev,
1833 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001834{
Alex Elder593a9e72012-02-07 12:03:37 -06001835 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001836
1837 return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001838}
1839
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001840static ssize_t rbd_major_show(struct device *dev,
1841 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001842{
Alex Elder593a9e72012-02-07 12:03:37 -06001843 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001844
1845 return sprintf(buf, "%d\n", rbd_dev->major);
1846}
1847
1848static ssize_t rbd_client_id_show(struct device *dev,
1849 struct device_attribute *attr, char *buf)
1850{
Alex Elder593a9e72012-02-07 12:03:37 -06001851 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001852
Alex Elder1dbb4392012-01-24 10:08:37 -06001853 return sprintf(buf, "client%lld\n",
1854 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001855}
1856
1857static ssize_t rbd_pool_show(struct device *dev,
1858 struct device_attribute *attr, char *buf)
1859{
Alex Elder593a9e72012-02-07 12:03:37 -06001860 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001861
1862 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1863}
1864
Alex Elder9bb2f332012-07-12 10:46:35 -05001865static ssize_t rbd_pool_id_show(struct device *dev,
1866 struct device_attribute *attr, char *buf)
1867{
1868 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1869
1870 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1871}
1872
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001873static ssize_t rbd_name_show(struct device *dev,
1874 struct device_attribute *attr, char *buf)
1875{
Alex Elder593a9e72012-02-07 12:03:37 -06001876 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001877
Alex Elder0bed54d2012-07-03 16:01:18 -05001878 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001879}
1880
1881static ssize_t rbd_snap_show(struct device *dev,
1882 struct device_attribute *attr,
1883 char *buf)
1884{
Alex Elder593a9e72012-02-07 12:03:37 -06001885 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001886
1887 return sprintf(buf, "%s\n", rbd_dev->snap_name);
1888}
1889
1890static ssize_t rbd_image_refresh(struct device *dev,
1891 struct device_attribute *attr,
1892 const char *buf,
1893 size_t size)
1894{
Alex Elder593a9e72012-02-07 12:03:37 -06001895 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001896 int rc;
1897 int ret = size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001898
1899 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1900
Josh Durgin263c6ca2011-12-05 10:43:42 -08001901 rc = __rbd_refresh_header(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001902 if (rc < 0)
1903 ret = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001904
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001905 mutex_unlock(&ctl_mutex);
1906 return ret;
1907}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001908
/*
 * Per-device sysfs attributes.  All are read-only (S_IRUGO) except
 * "refresh" and "create_snap", which are write-only (S_IWUSR)
 * action triggers.
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001918
/* Attribute array published for every rbd device (NULL-terminated) */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	&dev_attr_create_snap.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* Empty release: rbd device lifetime is managed by rbd_dev_release() */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
1950
1951
/*
 * sysfs - snapshots
 */
1955
1956static ssize_t rbd_snap_size_show(struct device *dev,
1957 struct device_attribute *attr,
1958 char *buf)
1959{
1960 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1961
Josh Durgin35915382011-12-05 18:25:13 -08001962 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001963}
1964
1965static ssize_t rbd_snap_id_show(struct device *dev,
1966 struct device_attribute *attr,
1967 char *buf)
1968{
1969 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1970
Josh Durgin35915382011-12-05 18:25:13 -08001971 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001972}
1973
/* Read-only attributes published for every snapshot device */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/*
 * Called by the driver core when the last reference to the snapshot
 * device is dropped; frees the rbd_snap and its duplicated name.
 */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2003
/*
 * Unlink a snapshot from the device's snap list and unregister its
 * sysfs device.  The snap itself is freed by rbd_snap_dev_release()
 * when its last device reference is dropped.
 */
static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
				  struct rbd_snap *snap)
{
	list_del(&snap->node);
	device_unregister(&snap->dev);
}
2010
2011static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
2012 struct rbd_snap *snap,
2013 struct device *parent)
2014{
2015 struct device *dev = &snap->dev;
2016 int ret;
2017
2018 dev->type = &rbd_snap_device_type;
2019 dev->parent = parent;
2020 dev->release = rbd_snap_dev_release;
2021 dev_set_name(dev, "snap_%s", snap->name);
2022 ret = device_register(dev);
2023
2024 return ret;
2025}
2026
2027static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
2028 int i, const char *name,
2029 struct rbd_snap **snapp)
2030{
2031 int ret;
2032 struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
2033 if (!snap)
2034 return -ENOMEM;
2035 snap->name = kstrdup(name, GFP_KERNEL);
2036 snap->size = rbd_dev->header.snap_sizes[i];
2037 snap->id = rbd_dev->header.snapc->snaps[i];
2038 if (device_is_registered(&rbd_dev->dev)) {
2039 ret = rbd_register_snap_dev(rbd_dev, snap,
2040 &rbd_dev->dev);
2041 if (ret < 0)
2042 goto err;
2043 }
2044 *snapp = snap;
2045 return 0;
2046err:
2047 kfree(snap->name);
2048 kfree(snap);
2049 return ret;
2050}
2051
2052/*
2053 * search for the previous snap in a null delimited string list
2054 */
2055const char *rbd_prev_snap_name(const char *name, const char *start)
2056{
2057 if (name < start + 2)
2058 return NULL;
2059
2060 name -= 2;
2061 while (*name) {
2062 if (name == start)
2063 return start;
2064 name--;
2065 }
2066 return name + 1;
2067}
2068
/*
 * compare the old list of snapshots that we have to what's in the header
 * and update it accordingly. Note that the header holds the snapshots
 * in a reverse order (from newest to oldest) and we need to go from
 * older to new so that we don't get a duplicate snap name when
 * doing the process (e.g., removed snapshot and recreated a new
 * one with the same name.
 *
 * Walks the existing device snap list (oldest first, via the _prev
 * iterator) in lock-step with the header arrays, removing in-core
 * snaps no longer present and creating devices for new ones.
 * Returns 0 on success or a negative errno from snapshot creation.
 */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
{
	const char *name, *first_name;
	int i = rbd_dev->header.total_snaps;
	struct rbd_snap *snap, *old_snap = NULL;
	int ret;
	struct list_head *p, *n;

	/* name starts one past the end of the name list and is stepped
	 * backward with rbd_prev_snap_name() as snapshots are consumed */
	first_name = rbd_dev->header.snap_names;
	name = first_name + rbd_dev->header.snap_names_len;

	list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
		u64 cur_id;

		old_snap = list_entry(p, struct rbd_snap, node);

		/* cur_id is only valid when i != 0; the !i test below
		 * short-circuits before cur_id would be read */
		if (i)
			cur_id = rbd_dev->header.snapc->snaps[i - 1];

		if (!i || old_snap->id < cur_id) {
			/* old_snap->id was skipped, thus was removed */
			__rbd_remove_snap_dev(rbd_dev, old_snap);
			continue;
		}
		if (old_snap->id == cur_id) {
			/* we have this snapshot already */
			i--;
			name = rbd_prev_snap_name(name, first_name);
			continue;
		}
		for (; i > 0;
		     i--, name = rbd_prev_snap_name(name, first_name)) {
			if (!name) {
				/* ran out of names before ids: header
				 * arrays are inconsistent */
				WARN_ON(1);
				return -EINVAL;
			}
			cur_id = rbd_dev->header.snapc->snaps[i];
			/* snapshot removal? handle it above */
			if (cur_id >= old_snap->id)
				break;
			/* a new snapshot */
			ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
			if (ret < 0)
				return ret;

			/* note that we add it backward so using n and not p */
			list_add(&snap->node, n);
			p = &snap->node;
		}
	}
	/* we're done going over the old snap list, just add what's left */
	for (; i > 0; i--) {
		name = rbd_prev_snap_name(name, first_name);
		if (!name) {
			WARN_ON(1);
			return -EINVAL;
		}
		ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
		if (ret < 0)
			return ret;
		list_add(&snap->node, &rbd_dev->snaps);
	}

	return 0;
}
2142
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002143static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2144{
Alex Elderf0f8cef2012-01-29 13:57:44 -06002145 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002146 struct device *dev;
2147 struct rbd_snap *snap;
2148
2149 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2150 dev = &rbd_dev->dev;
2151
2152 dev->bus = &rbd_bus_type;
2153 dev->type = &rbd_device_type;
2154 dev->parent = &rbd_root_dev;
2155 dev->release = rbd_dev_release;
2156 dev_set_name(dev, "%d", rbd_dev->id);
2157 ret = device_register(dev);
2158 if (ret < 0)
Alex Elderf0f8cef2012-01-29 13:57:44 -06002159 goto out;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002160
2161 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2162 ret = rbd_register_snap_dev(rbd_dev, snap,
2163 &rbd_dev->dev);
2164 if (ret < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002165 break;
2166 }
Alex Elderf0f8cef2012-01-29 13:57:44 -06002167out:
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002168 mutex_unlock(&ctl_mutex);
2169 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002170}
2171
/*
 * Unregister the rbd device from sysfs; final cleanup happens in
 * rbd_dev_release() once the last device reference is dropped.
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2176
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002177static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2178{
2179 int ret, rc;
2180
2181 do {
Alex Elder0bed54d2012-07-03 16:01:18 -05002182 ret = rbd_req_sync_watch(rbd_dev, rbd_dev->header_name,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002183 rbd_dev->header.obj_version);
2184 if (ret == -ERANGE) {
2185 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Josh Durgin263c6ca2011-12-05 10:43:42 -08002186 rc = __rbd_refresh_header(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002187 mutex_unlock(&ctl_mutex);
2188 if (rc < 0)
2189 return rc;
2190 }
2191 } while (ret == -ERANGE);
2192
2193 return ret;
2194}
2195
/* Highest rbd device id handed out so far; valid ids start at 1 */
static atomic64_t rbd_id_max = ATOMIC64_INIT(0);
2197
/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_id_get(struct rbd_device *rbd_dev)
{
	/* atomic increment makes the id unique without holding the lock */
	rbd_dev->id = atomic64_inc_return(&rbd_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002210
Alex Elder1ddbe942012-01-29 13:57:44 -06002211/*
Alex Elder499afd52012-02-02 08:13:29 -06002212 * Remove an rbd_dev from the global list, and record that its
2213 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002214 */
Alex Elder499afd52012-02-02 08:13:29 -06002215static void rbd_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002216{
Alex Elderd184f6b2012-01-29 13:57:44 -06002217 struct list_head *tmp;
2218 int rbd_id = rbd_dev->id;
2219 int max_id;
2220
2221 BUG_ON(rbd_id < 1);
Alex Elder499afd52012-02-02 08:13:29 -06002222
2223 spin_lock(&rbd_dev_list_lock);
2224 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002225
2226 /*
2227 * If the id being "put" is not the current maximum, there
2228 * is nothing special we need to do.
2229 */
2230 if (rbd_id != atomic64_read(&rbd_id_max)) {
2231 spin_unlock(&rbd_dev_list_lock);
2232 return;
2233 }
2234
2235 /*
2236 * We need to update the current maximum id. Search the
2237 * list to find out what it is. We're more likely to find
2238 * the maximum at the end, so search the list backward.
2239 */
2240 max_id = 0;
2241 list_for_each_prev(tmp, &rbd_dev_list) {
2242 struct rbd_device *rbd_dev;
2243
2244 rbd_dev = list_entry(tmp, struct rbd_device, node);
2245 if (rbd_id > max_id)
2246 max_id = rbd_id;
2247 }
Alex Elder499afd52012-02-02 08:13:29 -06002248 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002249
Alex Elder1ddbe942012-01-29 13:57:44 -06002250 /*
Alex Elderd184f6b2012-01-29 13:57:44 -06002251 * The max id could have been updated by rbd_id_get(), in
2252 * which case it now accurately reflects the new maximum.
2253 * Be careful not to overwrite the maximum value in that
2254 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002255 */
Alex Elderd184f6b2012-01-29 13:57:44 -06002256 atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
Alex Elderb7f23c32012-01-29 13:57:43 -06002257}
2258
/*
 * Advance *buf past any leading white space and return the length
 * of the token (run of non-space characters) that follows.  *buf
 * must be NUL-terminated; the token itself is not consumed.
 */
static inline size_t next_token(const char **buf)
{
	/* Characters for which isspace() is nonzero in C/POSIX locales */
	static const char spaces[] = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* Find start of token */

	return strcspn(*buf, spaces);	/* Return token length */
}
2277
/*
 * Find the next token in *buf and, if it fits (including its NUL
 * terminator), copy it into @token.  *buf must be NUL-terminated
 * on entry.
 *
 * Returns the token length (excluding the NUL): 0 if no token was
 * found, or >= @token_size if the token would not fit (in which
 * case @token is left untouched).
 *
 * *buf is advanced past the token even when it was too big to copy.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2307
2308/*
Alex Elderea3352f2012-07-09 21:04:23 -05002309 * Finds the next token in *buf, dynamically allocates a buffer big
2310 * enough to hold a copy of it, and copies the token into the new
2311 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2312 * that a duplicate buffer is created even for a zero-length token.
2313 *
2314 * Returns a pointer to the newly-allocated duplicate, or a null
2315 * pointer if memory for the duplicate was not available. If
2316 * the lenp argument is a non-null pointer, the length of the token
2317 * (not including the '\0') is returned in *lenp.
2318 *
2319 * If successful, the *buf pointer will be updated to point beyond
2320 * the end of the found token.
2321 *
2322 * Note: uses GFP_KERNEL for allocation.
2323 */
2324static inline char *dup_token(const char **buf, size_t *lenp)
2325{
2326 char *dup;
2327 size_t len;
2328
2329 len = next_token(buf);
2330 dup = kmalloc(len + 1, GFP_KERNEL);
2331 if (!dup)
2332 return NULL;
2333
2334 memcpy(dup, *buf, len);
2335 *(dup + len) = '\0';
2336 *buf += len;
2337
2338 if (lenp)
2339 *lenp = len;
2340
2341 return dup;
2342}
2343
2344/*
Alex Elder0bed54d2012-07-03 16:01:18 -05002345 * This fills in the pool_name, image_name, image_name_len, snap_name,
Alex Eldera725f65e2012-02-02 08:13:30 -06002346 * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
2347 * on the list of monitor addresses and other options provided via
2348 * /sys/bus/rbd/add.
Alex Elderd22f76e2012-07-12 10:46:35 -05002349 *
2350 * Note: rbd_dev is assumed to have been initially zero-filled.
Alex Eldera725f65e2012-02-02 08:13:30 -06002351 */
2352static int rbd_add_parse_args(struct rbd_device *rbd_dev,
2353 const char *buf,
Alex Elder7ef32142012-02-02 08:13:30 -06002354 const char **mon_addrs,
Alex Elder5214ecc2012-02-02 08:13:30 -06002355 size_t *mon_addrs_size,
Alex Eldere28fff262012-02-02 08:13:30 -06002356 char *options,
Alex Elder0bed54d2012-07-03 16:01:18 -05002357 size_t options_size)
Alex Eldera725f65e2012-02-02 08:13:30 -06002358{
Alex Elderd22f76e2012-07-12 10:46:35 -05002359 size_t len;
2360 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06002361
2362 /* The first four tokens are required */
2363
Alex Elder7ef32142012-02-02 08:13:30 -06002364 len = next_token(&buf);
2365 if (!len)
Alex Eldera725f65e2012-02-02 08:13:30 -06002366 return -EINVAL;
Alex Elder5214ecc2012-02-02 08:13:30 -06002367 *mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06002368 *mon_addrs = buf;
2369
2370 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06002371
Alex Eldere28fff262012-02-02 08:13:30 -06002372 len = copy_token(&buf, options, options_size);
2373 if (!len || len >= options_size)
2374 return -EINVAL;
Alex Eldera725f65e2012-02-02 08:13:30 -06002375
Alex Elderbf3e5ae2012-07-09 21:04:23 -05002376 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05002377 rbd_dev->pool_name = dup_token(&buf, NULL);
2378 if (!rbd_dev->pool_name)
Alex Elderd22f76e2012-07-12 10:46:35 -05002379 goto out_err;
Alex Eldere28fff262012-02-02 08:13:30 -06002380
Alex Elder0bed54d2012-07-03 16:01:18 -05002381 rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
2382 if (!rbd_dev->image_name)
Alex Elderbf3e5ae2012-07-09 21:04:23 -05002383 goto out_err;
Alex Eldere28fff262012-02-02 08:13:30 -06002384
Alex Eldercb8627c2012-07-09 21:04:23 -05002385 /* Create the name of the header object */
2386
Alex Elder0bed54d2012-07-03 16:01:18 -05002387 rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
Alex Elderbf3e5ae2012-07-09 21:04:23 -05002388 + sizeof (RBD_SUFFIX),
2389 GFP_KERNEL);
Alex Elder0bed54d2012-07-03 16:01:18 -05002390 if (!rbd_dev->header_name)
Alex Eldercb8627c2012-07-09 21:04:23 -05002391 goto out_err;
Alex Elder0bed54d2012-07-03 16:01:18 -05002392 sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
Alex Eldera725f65e2012-02-02 08:13:30 -06002393
Alex Eldere28fff262012-02-02 08:13:30 -06002394 /*
Alex Elder820a5f32012-07-09 21:04:24 -05002395 * The snapshot name is optional. If none is is supplied,
2396 * we use the default value.
Alex Eldere28fff262012-02-02 08:13:30 -06002397 */
Alex Elder820a5f32012-07-09 21:04:24 -05002398 rbd_dev->snap_name = dup_token(&buf, &len);
2399 if (!rbd_dev->snap_name)
2400 goto out_err;
2401 if (!len) {
2402 /* Replace the empty name with the default */
2403 kfree(rbd_dev->snap_name);
2404 rbd_dev->snap_name
2405 = kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
2406 if (!rbd_dev->snap_name)
2407 goto out_err;
2408
Alex Eldere28fff262012-02-02 08:13:30 -06002409 memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
2410 sizeof (RBD_SNAP_HEAD_NAME));
Alex Elder849b4262012-07-09 21:04:24 -05002411 }
Alex Eldere28fff262012-02-02 08:13:30 -06002412
Alex Eldera725f65e2012-02-02 08:13:30 -06002413 return 0;
Alex Elderd22f76e2012-07-12 10:46:35 -05002414
2415out_err:
Alex Elder0bed54d2012-07-03 16:01:18 -05002416 kfree(rbd_dev->header_name);
2417 kfree(rbd_dev->image_name);
Alex Elderd22f76e2012-07-12 10:46:35 -05002418 kfree(rbd_dev->pool_name);
2419 rbd_dev->pool_name = NULL;
2420
2421 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06002422}
2423
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002424static ssize_t rbd_add(struct bus_type *bus,
2425 const char *buf,
2426 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002427{
Alex Eldercb8627c2012-07-09 21:04:23 -05002428 char *options;
2429 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06002430 const char *mon_addrs = NULL;
2431 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06002432 struct ceph_osd_client *osdc;
2433 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002434
2435 if (!try_module_get(THIS_MODULE))
2436 return -ENODEV;
2437
Alex Elder27cc2592012-02-02 08:13:30 -06002438 options = kmalloc(count, GFP_KERNEL);
2439 if (!options)
2440 goto err_nomem;
Alex Eldercb8627c2012-07-09 21:04:23 -05002441 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2442 if (!rbd_dev)
2443 goto err_nomem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002444
2445 /* static rbd_device initialization */
2446 spin_lock_init(&rbd_dev->lock);
2447 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002448 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08002449 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002450
Josh Durginc6666012011-11-21 17:11:12 -08002451 init_rwsem(&rbd_dev->header_rwsem);
Alex Elder0e805a12012-01-11 19:42:15 -08002452
Alex Elderd184f6b2012-01-29 13:57:44 -06002453 /* generate unique id: find highest unique id, add one */
Alex Elder499afd52012-02-02 08:13:29 -06002454 rbd_id_get(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002455
Alex Eldera725f65e2012-02-02 08:13:30 -06002456 /* Fill in the device name, now that we have its id. */
Alex Elder81a89792012-02-02 08:13:30 -06002457 BUILD_BUG_ON(DEV_NAME_LEN
2458 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
2459 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->id);
Alex Eldere124a822012-01-29 13:57:44 -06002460
Alex Eldera725f65e2012-02-02 08:13:30 -06002461 /* parse add command */
Alex Elder7ef32142012-02-02 08:13:30 -06002462 rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
Alex Eldere28fff262012-02-02 08:13:30 -06002463 options, count);
Alex Eldera725f65e2012-02-02 08:13:30 -06002464 if (rc)
2465 goto err_put_id;
2466
Alex Elder5214ecc2012-02-02 08:13:30 -06002467 rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1,
2468 options);
Alex Elderd720bcb2012-02-02 08:13:30 -06002469 if (IS_ERR(rbd_dev->rbd_client)) {
2470 rc = PTR_ERR(rbd_dev->rbd_client);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002471 goto err_put_id;
Alex Elderd720bcb2012-02-02 08:13:30 -06002472 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002473
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002474 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06002475 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002476 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2477 if (rc < 0)
2478 goto err_out_client;
Alex Elder9bb2f332012-07-12 10:46:35 -05002479 rbd_dev->pool_id = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002480
2481 /* register our block device */
Alex Elder27cc2592012-02-02 08:13:30 -06002482 rc = register_blkdev(0, rbd_dev->name);
2483 if (rc < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002484 goto err_out_client;
Alex Elder27cc2592012-02-02 08:13:30 -06002485 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002486
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002487 rc = rbd_bus_add_dev(rbd_dev);
2488 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002489 goto err_out_blkdev;
2490
Alex Elder32eec682012-02-08 16:11:14 -06002491 /*
2492 * At this point cleanup in the event of an error is the job
2493 * of the sysfs code (initiated by rbd_bus_del_dev()).
2494 *
2495 * Set up and announce blkdev mapping.
2496 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002497 rc = rbd_init_disk(rbd_dev);
2498 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002499 goto err_out_bus;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002500
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002501 rc = rbd_init_watch_dev(rbd_dev);
2502 if (rc)
2503 goto err_out_bus;
2504
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002505 return count;
2506
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002507err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002508 /* this will also clean up rest of rbd_dev stuff */
2509
2510 rbd_bus_del_dev(rbd_dev);
2511 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002512 return rc;
2513
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002514err_out_blkdev:
2515 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2516err_out_client:
2517 rbd_put_client(rbd_dev);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002518err_put_id:
Alex Eldercb8627c2012-07-09 21:04:23 -05002519 if (rbd_dev->pool_name) {
Alex Elder820a5f32012-07-09 21:04:24 -05002520 kfree(rbd_dev->snap_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002521 kfree(rbd_dev->header_name);
2522 kfree(rbd_dev->image_name);
Alex Eldercb8627c2012-07-09 21:04:23 -05002523 kfree(rbd_dev->pool_name);
2524 }
Alex Elder499afd52012-02-02 08:13:29 -06002525 rbd_id_put(rbd_dev);
Alex Elder27cc2592012-02-02 08:13:30 -06002526err_nomem:
Alex Elder27cc2592012-02-02 08:13:30 -06002527 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05002528 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06002529
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002530 dout("Error adding device %s\n", buf);
2531 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06002532
2533 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002534}
2535
2536static struct rbd_device *__rbd_get_dev(unsigned long id)
2537{
2538 struct list_head *tmp;
2539 struct rbd_device *rbd_dev;
2540
Alex Eldere124a822012-01-29 13:57:44 -06002541 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002542 list_for_each(tmp, &rbd_dev_list) {
2543 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Eldere124a822012-01-29 13:57:44 -06002544 if (rbd_dev->id == id) {
2545 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002546 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06002547 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002548 }
Alex Eldere124a822012-01-29 13:57:44 -06002549 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002550 return NULL;
2551}
2552
/*
 * Device-model release callback for an rbd device: tear down the
 * header watch, drop the ceph client reference, free the gendisk and
 * block device major, then free all per-device allocations.
 *
 * The steps below are strictly ordered: the watch must be undone while
 * the osd client is still valid, and the id must be released before
 * the rbd_device that recorded it is freed.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Cancel the lingering watch request before unwatching, so the
	 * osd client stops resending it. */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev, rbd_dev->header_name);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->snap_name);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref, taken when the device was added */
	module_put(THIS_MODULE);
}
2583
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002584static ssize_t rbd_remove(struct bus_type *bus,
2585 const char *buf,
2586 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002587{
2588 struct rbd_device *rbd_dev = NULL;
2589 int target_id, rc;
2590 unsigned long ul;
2591 int ret = count;
2592
2593 rc = strict_strtoul(buf, 10, &ul);
2594 if (rc)
2595 return rc;
2596
2597 /* convert to int; abort if we lost anything in the conversion */
2598 target_id = (int) ul;
2599 if (target_id != ul)
2600 return -EINVAL;
2601
2602 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2603
2604 rbd_dev = __rbd_get_dev(target_id);
2605 if (!rbd_dev) {
2606 ret = -ENOENT;
2607 goto done;
2608 }
2609
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002610 __rbd_remove_all_snaps(rbd_dev);
2611 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002612
2613done:
2614 mutex_unlock(&ctl_mutex);
2615 return ret;
2616}
2617
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002618static ssize_t rbd_snap_add(struct device *dev,
2619 struct device_attribute *attr,
2620 const char *buf,
2621 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002622{
Alex Elder593a9e72012-02-07 12:03:37 -06002623 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002624 int ret;
2625 char *name = kmalloc(count + 1, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002626 if (!name)
2627 return -ENOMEM;
2628
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002629 snprintf(name, count, "%s", buf);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002630
2631 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2632
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002633 ret = rbd_header_add_snap(rbd_dev,
2634 name, GFP_KERNEL);
2635 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002636 goto err_unlock;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002637
Josh Durgin263c6ca2011-12-05 10:43:42 -08002638 ret = __rbd_refresh_header(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002639 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002640 goto err_unlock;
2641
2642 /* shouldn't hold ctl_mutex when notifying.. notify might
2643 trigger a watch callback that would need to get that mutex */
2644 mutex_unlock(&ctl_mutex);
2645
2646 /* make a best effort, don't error if failed */
Alex Elder0bed54d2012-07-03 16:01:18 -05002647 rbd_req_sync_notify(rbd_dev, rbd_dev->header_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002648
2649 ret = count;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002650 kfree(name);
2651 return ret;
2652
2653err_unlock:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002654 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002655 kfree(name);
2656 return ret;
2657}
2658
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002659/*
2660 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002661 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002662 */
2663static int rbd_sysfs_init(void)
2664{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002665 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002666
Alex Elderfed4c142012-02-07 12:03:36 -06002667 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06002668 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002669 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002670
Alex Elderfed4c142012-02-07 12:03:36 -06002671 ret = bus_register(&rbd_bus_type);
2672 if (ret < 0)
2673 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002674
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002675 return ret;
2676}
2677
/*
 * Undo rbd_sysfs_init(): the bus is unregistered first, then the root
 * device it was registered under — the reverse of the setup order.
 */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
2683
2684int __init rbd_init(void)
2685{
2686 int rc;
2687
2688 rc = rbd_sysfs_init();
2689 if (rc)
2690 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002691 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002692 return 0;
2693}
2694
/* Module unload: tear down the /sys/bus/rbd hierarchy. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2699
/* Module entry/exit points and metadata. */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");