blob: b124442dab3a1156436ae23dc4502084aee03047 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elder593a9e72012-02-07 12:03:37 -060044/*
45 * The basic unit of block I/O is a sector. It is interpreted in a
46 * number of contexts in Linux (blk, bio, genhd), but the default is
47 * universally 512 bytes. These symbols are just slightly more
48 * meaningful than the bare numbers they represent.
49 */
50#define SECTOR_SHIFT 9
51#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
52
Alex Elderf0f8cef2012-01-29 13:57:44 -060053#define RBD_DRV_NAME "rbd"
54#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070055
56#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
57
Yehuda Sadeh602adf42010-08-12 16:11:25 -070058#define RBD_MAX_SNAP_NAME_LEN 32
59#define RBD_MAX_OPT_LEN 1024
60
61#define RBD_SNAP_HEAD_NAME "-"
62
Alex Elder81a89792012-02-02 08:13:30 -060063/*
64 * An RBD device name will be "rbd#", where the "rbd" comes from
65 * RBD_DRV_NAME above, and # is a unique integer identifier.
66 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
67 * enough to hold all possible device names.
68 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070069#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060070#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070071
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070072#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
73
Yehuda Sadeh602adf42010-08-12 16:11:25 -070074/*
75 * block device image metadata (in-memory version)
76 */
/*
 * In-memory image header, decoded from the on-disk format by
 * rbd_header_from_disk(); all pointer fields are kmalloc'd and
 * released by rbd_header_free().
 */
struct rbd_image_header {
	u64 image_size;		/* image size in bytes */
	char *object_prefix;	/* NUL-terminated copy of on-disk block_name */
	__u8 obj_order;		/* log2 of bytes per data object */
	__u8 crypt_type;	/* on-disk crypto type (opaque here) */
	__u8 comp_type;		/* on-disk compression type (opaque here) */
	struct ceph_snap_context *snapc;
	size_t snap_names_len;	/* total bytes of the packed name table */
	u64 snap_seq;
	u32 total_snaps;

	char *snap_names;	/* packed, NUL-separated snapshot names */
	u64 *snap_sizes;	/* one image size per snapshot */

	u64 obj_version;
};
93
/* Per-client mount options parsed by parse_rbd_opts_token(). */
struct rbd_options {
	int notify_timeout;	/* "notify_timeout=%d"; defaults to RBD_NOTIFY_TIMEOUT_DEFAULT */
};
97
98/*
Alex Elderf0f8cef2012-01-29 13:57:44 -060099 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700100 */
struct rbd_client {
	struct ceph_client *client;	/* owns the ceph_options passed at create */
	struct rbd_options *rbd_opts;	/* freed in rbd_client_release() */
	struct kref kref;		/* released via rbd_client_release() */
	struct list_head node;		/* entry on rbd_client_list */
};
107
108/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600109 * a request completion status
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700110 */
struct rbd_req_status {
	int done;	/* nonzero once this sub-request has completed */
	int rc;		/* completion status code */
	u64 bytes;	/* bytes transferred */
};
116
117/*
118 * a collection of requests
119 */
struct rbd_req_coll {
	int total;		/* number of sub-requests in the collection */
	int num_done;		/* sub-requests already completed, in order */
	struct kref kref;	/* dropped once per completed slot; freed by rbd_coll_release() */
	struct rbd_req_status status[0];	/* trailing array, one slot per sub-request */
};
126
Alex Elderf0f8cef2012-01-29 13:57:44 -0600127/*
128 * a single io request
129 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;
	int coll_index;			/* this request's slot in coll->status[] */
	struct rbd_req_coll *coll;	/* NULL when not part of a collection */
};
138
/* One snapshot of an image, exposed through sysfs via @dev. */
struct rbd_snap {
	struct device dev;		/* sysfs representation */
	const char *name;
	u64 size;			/* image size at this snapshot */
	struct list_head node;		/* entry on rbd_device->snaps */
	u64 id;				/* snapshot id */
};
146
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700147/*
148 * a single device
149 */
struct rbd_device {
	int id;			/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;	/* blkdev's gendisk and rq */
	struct request_queue *q;

	struct rbd_client *rbd_client;	/* shared ceph client, ref-counted */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;	/* queue lock */

	struct rbd_image_header header;
	char *image_name;	/* image name, kmalloc'd */
	size_t image_name_len;
	char *header_name;	/* name of the image's header object */
	char *pool_name;
	int pool_id;

	struct ceph_osd_event *watch_event;
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore header_rwsem;
	char *snap_name;	/* mapped snapshot name, or RBD_SNAP_HEAD_NAME */
	u64 snap_id;		/* current snapshot id */
	int read_only;		/* nonzero when a snapshot is mapped */

	struct list_head node;	/* entry on rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
};
187
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700188static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a822012-01-29 13:57:44 -0600189
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700190static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a822012-01-29 13:57:44 -0600191static DEFINE_SPINLOCK(rbd_dev_list_lock);
192
Alex Elder432b8582012-01-29 13:57:44 -0600193static LIST_HEAD(rbd_client_list); /* clients */
194static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700195
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800196static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
197static void rbd_dev_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800198static ssize_t rbd_snap_add(struct device *dev,
199 struct device_attribute *attr,
200 const char *buf,
201 size_t count);
202static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
Justin P. Mattock69932482011-07-26 23:06:29 -0700203 struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800204
Alex Elderf0f8cef2012-01-29 13:57:44 -0600205static ssize_t rbd_add(struct bus_type *bus, const char *buf,
206 size_t count);
207static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
208 size_t count);
209
/* /sys/bus/rbd/{add,remove}: write-only control files for mapping images. */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/* Intentionally empty: rbd_root_dev is static, nothing to free. */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Parent device of all mapped rbd devices in sysfs. */
static struct device rbd_root_dev = {
	.init_name = "rbd",
	.release = rbd_root_dev_release,
};
229
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800230
/* Take a reference on the device's embedded struct device. */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}
235
/* Drop a reference taken with rbd_get_dev(). */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700240
Josh Durgin263c6ca2011-12-05 10:43:42 -0800241static int __rbd_refresh_header(struct rbd_device *rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700242
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700243static int rbd_open(struct block_device *bdev, fmode_t mode)
244{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600245 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700246
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800247 rbd_get_dev(rbd_dev);
248
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700249 set_device_ro(bdev, rbd_dev->read_only);
250
251 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
252 return -EROFS;
253
254 return 0;
255}
256
/* Close the block device: drop the reference taken in rbd_open(). */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	rbd_put_dev(rbd_dev);

	return 0;
}
265
/* Block-device entry points; only open/release are implemented. */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
271
/*
 * Initialize an rbd client instance.
 * We own *ceph_opts: on success ownership passes to rbdc->client,
 * on failure it is destroyed here before returning.  *rbd_opts is
 * NOT freed on failure; the caller remains responsible for it.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts,
					    struct rbd_options *rbd_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("rbd_client_create\n");
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	/* nested: rbd_add() already holds ctl_mutex when we get here */
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
	if (IS_ERR(rbdc->client))
		/* NOTE(review): -ENOMEM is returned here even though
		 * PTR_ERR(rbdc->client) may be more specific — confirm */
		goto out_mutex;
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_err;

	rbdc->rbd_opts = rbd_opts;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	mutex_unlock(&ctl_mutex);

	dout("rbd_client_create created %p\n", rbdc);
	return rbdc;

out_err:
	/* destroying the client also releases the ceph options it owns */
	ceph_destroy_client(rbdc->client);
out_mutex:
	mutex_unlock(&ctl_mutex);
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	return ERR_PTR(ret);
}
322
323/*
324 * Find a ceph client with specific addr and configuration.
325 */
Alex Elder43ae4702012-07-03 16:01:18 -0500326static struct rbd_client *__rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700327{
328 struct rbd_client *client_node;
329
Alex Elder43ae4702012-07-03 16:01:18 -0500330 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700331 return NULL;
332
333 list_for_each_entry(client_node, &rbd_client_list, node)
Alex Elder43ae4702012-07-03 16:01:18 -0500334 if (!ceph_compare_options(ceph_opts, client_node->client))
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700335 return client_node;
336 return NULL;
337}
338
339/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700340 * mount options
341 */
/* Option token ids; ints sort before Opt_last_int, strings before
 * Opt_last_string, so parse_rbd_opts_token() can classify by range. */
enum {
	Opt_notify_timeout,
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
};

static match_table_t rbd_opts_tokens = {
	{Opt_notify_timeout, "notify_timeout=%d"},
	/* int args above */
	/* string args above */
	{-1, NULL}
};
356
357static int parse_rbd_opts_token(char *c, void *private)
358{
Alex Elder43ae4702012-07-03 16:01:18 -0500359 struct rbd_options *rbd_opts = private;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700360 substring_t argstr[MAX_OPT_ARGS];
361 int token, intval, ret;
362
Alex Elder43ae4702012-07-03 16:01:18 -0500363 token = match_token(c, rbd_opts_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700364 if (token < 0)
365 return -EINVAL;
366
367 if (token < Opt_last_int) {
368 ret = match_int(&argstr[0], &intval);
369 if (ret < 0) {
370 pr_err("bad mount option arg (not int) "
371 "at '%s'\n", c);
372 return ret;
373 }
374 dout("got int token %d val %d\n", token, intval);
375 } else if (token > Opt_last_int && token < Opt_last_string) {
376 dout("got string token %d val %s\n", token,
377 argstr[0].from);
378 } else {
379 dout("got token %d\n", token);
380 }
381
382 switch (token) {
383 case Opt_notify_timeout:
Alex Elder43ae4702012-07-03 16:01:18 -0500384 rbd_opts->notify_timeout = intval;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700385 break;
386 default:
387 BUG_ON(token);
388 }
389 return 0;
390}
391
392/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700393 * Get a ceph client with specific addr and configuration, if one does
394 * not exist create it.
395 */
/*
 * Get a ceph client matching the monitor address and options, creating
 * one if no shareable client exists.  On success the returned client
 * owns both the parsed ceph options and the rbd options; on all other
 * paths both are destroyed/freed here.
 */
static struct rbd_client *rbd_get_client(const char *mon_addr,
					 size_t mon_addr_len,
					 char *options)
{
	struct rbd_client *rbdc;
	struct ceph_options *ceph_opts;
	struct rbd_options *rbd_opts;

	rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		return ERR_PTR(-ENOMEM);

	rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;

	/* fills in rbd_opts via the parse_rbd_opts_token() callback */
	ceph_opts = ceph_parse_options(options, mon_addr,
					mon_addr + mon_addr_len,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(ceph_opts)) {
		kfree(rbd_opts);
		return ERR_CAST(ceph_opts);
	}

	spin_lock(&rbd_client_list_lock);
	rbdc = __rbd_client_find(ceph_opts);
	if (rbdc) {
		/* using an existing client */
		kref_get(&rbdc->kref);
		spin_unlock(&rbd_client_list_lock);

		/* the shared client keeps its own copies of both */
		ceph_destroy_options(ceph_opts);
		kfree(rbd_opts);

		return rbdc;
	}
	spin_unlock(&rbd_client_list_lock);

	/* rbd_client_create() consumes ceph_opts on every path */
	rbdc = rbd_client_create(ceph_opts, rbd_opts);

	if (IS_ERR(rbdc))
		kfree(rbd_opts);

	return rbdc;
}
439
/*
 * Destroy ceph client
 *
 * Unlinks the client from rbd_client_list itself (taking
 * rbd_client_list_lock); the caller must NOT hold that lock
 * when dropping the final reference.
 */
/* kref release callback: tear down a shared client once unused. */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	/* unlink before teardown so __rbd_client_find() can't return us */
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc->rbd_opts);
	kfree(rbdc);
}
458
459/*
460 * Drop reference to ceph client node. If it's not referenced anymore, release
461 * it.
462 */
static void rbd_put_client(struct rbd_device *rbd_dev)
{
	/* may free the client via rbd_client_release() */
	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
	rbd_dev->rbd_client = NULL;
}
468
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700469/*
470 * Destroy requests collection
471 */
/* kref release: last sub-request finished in rbd_coll_end_req_index() */
static void rbd_coll_release(struct kref *kref)
{
	struct rbd_req_coll *coll =
		container_of(kref, struct rbd_req_coll, kref);

	dout("rbd_coll_release %p\n", coll);
	kfree(coll);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700480
481/*
482 * Create a new header structure, translate header format from the on-disk
483 * header.
484 */
/*
 * Translate an on-disk header into a freshly allocated in-memory one.
 * Returns 0 on success, -ENXIO on bad magic, -EINVAL on an implausible
 * snapshot count, -ENOMEM on allocation failure (partial allocations
 * are freed).  On success the caller owns the allocations; release
 * them with rbd_header_free().
 */
static int rbd_header_from_disk(struct rbd_image_header *header,
				 struct rbd_image_header_ondisk *ondisk,
				 u32 allocated_snaps,
				 gfp_t gfp_flags)
{
	u32 i, snap_count;

	/* on-disk magic check */
	if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT)))
		return -ENXIO;

	snap_count = le32_to_cpu(ondisk->snap_count);
	/* reject counts that would overflow the snapc allocation below */
	if (snap_count > (UINT_MAX - sizeof(struct ceph_snap_context))
			 / sizeof (*ondisk))
		return -EINVAL;
	header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
				snap_count * sizeof(u64),
				gfp_flags);
	if (!header->snapc)
		return -ENOMEM;

	/* NOTE(review): snap_names_len comes straight from disk and is not
	 * range-checked before the kmalloc below — confirm it is bounded */
	header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
	if (snap_count) {
		header->snap_names = kmalloc(header->snap_names_len,
					     gfp_flags);
		if (!header->snap_names)
			goto err_snapc;
		header->snap_sizes = kmalloc(snap_count * sizeof(u64),
					     gfp_flags);
		if (!header->snap_sizes)
			goto err_names;
	} else {
		header->snap_names = NULL;
		header->snap_sizes = NULL;
	}

	header->object_prefix = kmalloc(sizeof (ondisk->block_name) + 1,
					gfp_flags);
	if (!header->object_prefix)
		goto err_sizes;

	/* block_name need not be NUL-terminated on disk; terminate it */
	memcpy(header->object_prefix, ondisk->block_name,
	       sizeof(ondisk->block_name));
	header->object_prefix[sizeof (ondisk->block_name)] = '\0';

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->obj_order = ondisk->options.order;
	header->crypt_type = ondisk->options.crypt_type;
	header->comp_type = ondisk->options.comp_type;

	atomic_set(&header->snapc->nref, 1);
	header->snap_seq = le64_to_cpu(ondisk->snap_seq);
	header->snapc->num_snaps = snap_count;
	header->total_snaps = snap_count;

	/*
	 * Snapshot ids/sizes/names are only filled in when the caller
	 * allocated the ondisk buffer for exactly this many snaps; a
	 * mismatch means the caller will re-read with the right size.
	 * NOTE(review): on mismatch snap_names/snap_sizes stay
	 * uninitialized (kmalloc) — confirm callers never read them.
	 */
	if (snap_count && allocated_snaps == snap_count) {
		for (i = 0; i < snap_count; i++) {
			header->snapc->snaps[i] =
				le64_to_cpu(ondisk->snaps[i].id);
			header->snap_sizes[i] =
				le64_to_cpu(ondisk->snaps[i].image_size);
		}

		/* copy snapshot names */
		memcpy(header->snap_names, &ondisk->snaps[i],
			header->snap_names_len);
	}

	return 0;

err_sizes:
	kfree(header->snap_sizes);
err_names:
	kfree(header->snap_names);
err_snapc:
	kfree(header->snapc);
	return -ENOMEM;
}
562
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700563static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
564 u64 *seq, u64 *size)
565{
566 int i;
567 char *p = header->snap_names;
568
Alex Elder00f1f362012-02-07 12:03:36 -0600569 for (i = 0; i < header->total_snaps; i++) {
570 if (!strcmp(snap_name, p)) {
571
572 /* Found it. Pass back its id and/or size */
573
574 if (seq)
575 *seq = header->snapc->snaps[i];
576 if (size)
577 *size = header->snap_sizes[i];
578 return i;
579 }
580 p += strlen(p) + 1; /* Skip ahead to the next name */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700581 }
Alex Elder00f1f362012-02-07 12:03:36 -0600582 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700583}
584
/*
 * Set the mapping's snapshot context from rbd_dev->snap_name.
 * Mapping the head ("-") makes the device writable; mapping a named
 * snapshot makes it read-only.  Returns 0 on success or the
 * snap_by_name() error (-ENOENT) when the snapshot does not exist.
 */
static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
{
	struct rbd_image_header *header = &rbd_dev->header;
	struct ceph_snap_context *snapc = header->snapc;
	int ret = -ENOENT;

	/* writer lock: we update snapc->seq, snap_id and read_only */
	down_write(&rbd_dev->header_rwsem);

	if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		/* mapping the live image: no snapshot, writable */
		if (header->total_snaps)
			snapc->seq = header->snap_seq;
		else
			snapc->seq = 0;
		rbd_dev->snap_id = CEPH_NOSNAP;
		rbd_dev->read_only = 0;
		if (size)
			*size = header->image_size;
	} else {
		/* mapping a named snapshot: read-only at its seq */
		ret = snap_by_name(header, rbd_dev->snap_name,
				   &snapc->seq, size);
		if (ret < 0)
			goto done;
		rbd_dev->snap_id = snapc->seq;
		rbd_dev->read_only = 1;
	}

	ret = 0;
done:
	up_write(&rbd_dev->header_rwsem);
	return ret;
}
617
/* Free everything allocated by rbd_header_from_disk(). */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	kfree(header->snap_sizes);
	kfree(header->snap_names);
	kfree(header->snapc);
}
625
626/*
627 * get the actual striped segment name, offset and length
628 */
629static u64 rbd_get_segment(struct rbd_image_header *header,
Alex Elderca1e49a2012-07-10 20:30:09 -0500630 const char *object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700631 u64 ofs, u64 len,
632 char *seg_name, u64 *segofs)
633{
634 u64 seg = ofs >> header->obj_order;
635
636 if (seg_name)
637 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
Alex Elderca1e49a2012-07-10 20:30:09 -0500638 "%s.%012llx", object_prefix, seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700639
640 ofs = ofs & ((1 << header->obj_order) - 1);
641 len = min_t(u64, len, (1 << header->obj_order) - ofs);
642
643 if (segofs)
644 *segofs = ofs;
645
646 return len;
647}
648
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700649static int rbd_get_num_segments(struct rbd_image_header *header,
650 u64 ofs, u64 len)
651{
652 u64 start_seg = ofs >> header->obj_order;
653 u64 end_seg = (ofs + len - 1) >> header->obj_order;
654 return end_seg - start_seg + 1;
655}
656
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700657/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700658 * returns the size of an object in the image
659 */
660static u64 rbd_obj_bytes(struct rbd_image_header *header)
661{
662 return 1 << header->obj_order;
663}
664
665/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700666 * bio helpers
667 */
668
669static void bio_chain_put(struct bio *chain)
670{
671 struct bio *tmp;
672
673 while (chain) {
674 tmp = chain;
675 chain = chain->bi_next;
676 bio_put(tmp);
677 }
678}
679
680/*
681 * zeros a bio chain, starting at specific offset
682 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* byte offset within the whole chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* zero the part of this bvec lying at or
				 * beyond start_ofs (all of it once past) */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
706
707/*
708 * bio_chain_clone - clone a chain of bios up to a certain length.
709 * might return a bio_pair that will need to be released.
710 */
711static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
712 struct bio_pair **bp,
713 int len, gfp_t gfpmask)
714{
715 struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
716 int total = 0;
717
718 if (*bp) {
719 bio_pair_release(*bp);
720 *bp = NULL;
721 }
722
723 while (old_chain && (total < len)) {
724 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
725 if (!tmp)
726 goto err_out;
727
728 if (total + old_chain->bi_size > len) {
729 struct bio_pair *bp;
730
731 /*
732 * this split can only happen with a single paged bio,
733 * split_bio will BUG_ON if this is not the case
734 */
735 dout("bio_chain_clone split! total=%d remaining=%d"
736 "bi_size=%d\n",
737 (int)total, (int)len-total,
738 (int)old_chain->bi_size);
739
740 /* split the bio. We'll release it either in the next
741 call, or it will have to be released outside */
Alex Elder593a9e72012-02-07 12:03:37 -0600742 bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700743 if (!bp)
744 goto err_out;
745
746 __bio_clone(tmp, &bp->bio1);
747
748 *next = &bp->bio2;
749 } else {
750 __bio_clone(tmp, old_chain);
751 *next = old_chain->bi_next;
752 }
753
754 tmp->bi_bdev = NULL;
755 gfpmask &= ~__GFP_WAIT;
756 tmp->bi_next = NULL;
757
758 if (!new_chain) {
759 new_chain = tail = tmp;
760 } else {
761 tail->bi_next = tmp;
762 tail = tmp;
763 }
764 old_chain = old_chain->bi_next;
765
766 total += tmp->bi_size;
767 }
768
769 BUG_ON(total < len);
770
771 if (tail)
772 tail->bi_next = NULL;
773
774 *old = old_chain;
775
776 return new_chain;
777
778err_out:
779 dout("bio_chain_clone with err\n");
780 bio_chain_put(new_chain);
781 return NULL;
782}
783
784/*
785 * helpers for osd request op vectors.
786 */
787static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
788 int num_ops,
789 int opcode,
790 u32 payload_len)
791{
792 *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
793 GFP_NOIO);
794 if (!*ops)
795 return -ENOMEM;
796 (*ops)[0].op = opcode;
797 /*
798 * op extent offset and length will be set later on
799 * in calc_raw_layout()
800 */
801 (*ops)[0].payload_len = payload_len;
802 return 0;
803}
804
/* Free an op vector allocated by rbd_create_rw_ops(). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
809
/*
 * Record completion of sub-request @index and complete the blk request
 * for every contiguous run of finished sub-requests, strictly in order.
 * With no collection the whole request is completed at once.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n",
	     coll, index, ret, len);

	if (!rq)
		return;

	if (!coll) {
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	/* queue_lock serializes status[] updates and blk completion */
	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	/* advance max over the run of completed slots from num_done on */
	max = min = coll->num_done;
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i<max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		/* one collection ref is dropped per completed slot */
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
847
/* Complete req's own slot in its collection (or the whole request). */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
853
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700854/*
855 * Send ceph osd request
 *
 * Build and submit a single OSD request covering [ofs, ofs+len) of
 * @object_name.  Data is carried either in @bio or in @pages.  If
 * @rbd_cb is NULL the call is synchronous: we wait for completion,
 * optionally report the reassert version through @ver, and drop the
 * osd request here.  Otherwise completion is asynchronous via the
 * callback, which owns req_data and the osd request.
856 */
857static int rbd_do_request(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -0500858 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700859 struct ceph_snap_context *snapc,
860 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -0500861 const char *object_name, u64 ofs, u64 len,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700862 struct bio *bio,
863 struct page **pages,
864 int num_pages,
865 int flags,
866 struct ceph_osd_req_op *ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700867 struct rbd_req_coll *coll,
868 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700869 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700870 struct ceph_msg *msg),
871 struct ceph_osd_request **linger_req,
872 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700873{
874 struct ceph_osd_request *req;
875 struct ceph_file_layout *layout;
876 int ret;
877 u64 bno;
878 struct timespec mtime = CURRENT_TIME;
879 struct rbd_request *req_data;
880 struct ceph_osd_request_head *reqhead;
Alex Elder1dbb4392012-01-24 10:08:37 -0600881 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700882
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700883 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700884 if (!req_data) {
885 if (coll)
886 rbd_coll_end_req_index(rq, coll, coll_index,
887 -ENOMEM, len);
888 return -ENOMEM;
889 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700890
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700891 if (coll) {
892 req_data->coll = coll;
893 req_data->coll_index = coll_index;
894 }
895
Alex Elderaded07e2012-07-03 16:01:18 -0500896 dout("rbd_do_request object_name=%s ofs=%lld len=%lld\n",
897 object_name, len, ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700898
 /* header_rwsem protects rbd_dev->header / pool_id while the request
  * is allocated and laid out; it is dropped before submission. */
Alex Elder0ce1a792012-07-03 16:01:18 -0500899 down_read(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700900
Alex Elder0ce1a792012-07-03 16:01:18 -0500901 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder1dbb4392012-01-24 10:08:37 -0600902 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
903 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -0700904 if (!req) {
Alex Elder0ce1a792012-07-03 16:01:18 -0500905 up_read(&rbd_dev->header_rwsem);
Sage Weil4ad12622011-05-03 09:23:36 -0700906 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700907 goto done_pages;
908 }
909
910 req->r_callback = rbd_cb;
911
912 req_data->rq = rq;
913 req_data->bio = bio;
914 req_data->pages = pages;
915 req_data->len = len;
916
917 req->r_priv = req_data;
918
919 reqhead = req->r_request->front.iov_base;
920 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
921
 /* NOTE(review): strncpy() does not NUL-terminate r_oid when
  * object_name exactly fills the buffer, and the strlen() below would
  * then read past it.  Assumes object names are always shorter than
  * sizeof(req->r_oid) -- verify at the call sites. */
Alex Elderaded07e2012-07-03 16:01:18 -0500922 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700923 req->r_oid_len = strlen(req->r_oid);
924
 /* Single striping unit: one object per segment of 1 << RBD_MAX_OBJ_ORDER. */
925 layout = &req->r_file_layout;
926 memset(layout, 0, sizeof(*layout));
927 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
928 layout->fl_stripe_count = cpu_to_le32(1);
929 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
Alex Elder0ce1a792012-07-03 16:01:18 -0500930 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
Alex Elder1dbb4392012-01-24 10:08:37 -0600931 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
932 req, ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700933
934 ceph_osdc_build_request(req, ofs, &len,
935 ops,
936 snapc,
937 &mtime,
938 req->r_oid, req->r_oid_len);
Alex Elder0ce1a792012-07-03 16:01:18 -0500939 up_read(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700940
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700941 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600942 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700943 *linger_req = req;
944 }
945
Alex Elder1dbb4392012-01-24 10:08:37 -0600946 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700947 if (ret < 0)
948 goto done_err;
949
 /* Synchronous path: no callback supplied, so wait here. */
950 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600951 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700952 if (ver)
953 *ver = le64_to_cpu(req->r_reassert_version.version);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700954 dout("reassert_ver=%lld\n",
955 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700956 ceph_osdc_put_request(req);
957 }
958 return ret;
959
960done_err:
961 bio_chain_put(req_data->bio);
962 ceph_osdc_put_request(req);
963done_pages:
 /* NOTE(review): on the synchronous path rq and coll are both NULL
  * here; rbd_coll_end_req_index() must tolerate that -- confirm. */
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700964 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700965 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700966 return ret;
967}
968
969/*
970 * Ceph osd op callback
 *
 * Completion handler for asynchronous requests issued by
 * rbd_do_request().  Parses the reply, fixes up short/missing reads
 * by zero-filling the bio chain, completes the collection slot, and
 * releases the osd request and per-request state.
971 */
972static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
973{
974 struct rbd_request *req_data = req->r_priv;
975 struct ceph_osd_reply_head *replyhead;
976 struct ceph_osd_op *op;
977 __s32 rc;
978 u64 bytes;
979 int read_op;
980
981 /* parse reply */
982 replyhead = msg->front.iov_base;
983 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
984 op = (void *)(replyhead + 1);
985 rc = le32_to_cpu(replyhead->result);
986 bytes = le64_to_cpu(op->extent.length);
Dan Carpenter895cfcc2012-06-06 09:15:33 -0500987 read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700988
989 dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);
990
 /* A read of a nonexistent object, or a short read, is not an error
  * for the block layer: return zeroes for the missing part. */
991 if (rc == -ENOENT && read_op) {
992 zero_bio_chain(req_data->bio, 0);
993 rc = 0;
994 } else if (rc == 0 && read_op && bytes < req_data->len) {
995 zero_bio_chain(req_data->bio, bytes);
996 bytes = req_data->len;
997 }
998
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700999 rbd_coll_end_req(req_data, rc, bytes);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001000
1001 if (req_data->bio)
1002 bio_chain_put(req_data->bio);
1003
1004 ceph_osdc_put_request(req);
1005 kfree(req_data);
1006}
1007
/*
 * Minimal completion callback: just drop the osd request reference.
 * Used for fire-and-forget requests such as notify acks.
 */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001008static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1009{
1010 ceph_osdc_put_request(req);
1011}
1012
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001013/*
1014 * Do a synchronous ceph osd operation
 *
 * If @orig_ops is NULL, a single op vector of @opcode is built here
 * (and destroyed again before returning); otherwise the caller's ops
 * are used as-is.  For writes, @buf is copied into a page vector
 * before submission; for reads, the result is copied back into @buf.
1015 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001016static int rbd_req_sync_op(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001017 struct ceph_snap_context *snapc,
1018 u64 snapid,
1019 int opcode,
1020 int flags,
1021 struct ceph_osd_req_op *orig_ops,
Alex Elderaded07e2012-07-03 16:01:18 -05001022 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001023 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001024 char *buf,
1025 struct ceph_osd_request **linger_req,
1026 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001027{
1028 int ret;
1029 struct page **pages;
1030 int num_pages;
1031 struct ceph_osd_req_op *ops = orig_ops;
1032 u32 payload_len;
1033
1034 num_pages = calc_pages_for(ofs , len);
1035 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
Dan Carpenterb8d06382010-10-11 21:14:23 +02001036 if (IS_ERR(pages))
1037 return PTR_ERR(pages);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001038
1039 if (!orig_ops) {
1040 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
1041 ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
1042 if (ret < 0)
1043 goto done;
1044
1045 if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
1046 ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
1047 if (ret < 0)
1048 goto done_ops;
1049 }
1050 }
1051
 /* rq/bio NULL and coll NULL: this is the synchronous, pages-based path. */
Alex Elder0ce1a792012-07-03 16:01:18 -05001052 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001053 object_name, ofs, len, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001054 pages, num_pages,
1055 flags,
1056 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001057 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001058 NULL,
1059 linger_req, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001060 if (ret < 0)
1061 goto done_ops;
1062
 /* On success, ret holds the number of bytes read back. */
1063 if ((flags & CEPH_OSD_FLAG_READ) && buf)
1064 ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
1065
1066done_ops:
1067 if (!orig_ops)
1068 rbd_destroy_ops(ops);
1069done:
1070 ceph_release_page_vector(pages, num_pages);
1071 return ret;
1072}
1073
1074/*
1075 * Do an asynchronous ceph osd operation
 *
 * Maps the image-relative range [ofs, ofs+len) to the object segment
 * it lives in and submits one async request for it.  The caller has
 * already split bios so a request never crosses a segment boundary.
1076 */
1077static int rbd_do_op(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -05001078 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001079 struct ceph_snap_context *snapc,
1080 u64 snapid,
Alex Elderd1f57ea2012-06-26 12:57:03 -07001081 int opcode, int flags,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001082 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001083 struct bio *bio,
1084 struct rbd_req_coll *coll,
1085 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001086{
1087 char *seg_name;
1088 u64 seg_ofs;
1089 u64 seg_len;
1090 int ret;
1091 struct ceph_osd_req_op *ops;
1092 u32 payload_len;
1093
 /* +1 for the NUL terminator written by rbd_get_segment(). */
1094 seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
1095 if (!seg_name)
1096 return -ENOMEM;
1097
1098 seg_len = rbd_get_segment(&rbd_dev->header,
Alex Elderca1e49a2012-07-10 20:30:09 -05001099 rbd_dev->header.object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001100 ofs, len,
1101 seg_name, &seg_ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001102
1103 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1104
1105 ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
1106 if (ret < 0)
1107 goto done;
1108
1109 /* we've taken care of segment sizes earlier when we
1110 cloned the bios. We should never have a segment
1111 truncated at this point */
1112 BUG_ON(seg_len < len);
1113
1114 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1115 seg_name, seg_ofs, seg_len,
1116 bio,
1117 NULL, 0,
1118 flags,
1119 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001120 coll, coll_index,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001121 rbd_req_cb, 0, NULL);
Sage Weil11f77002011-05-12 16:13:54 -07001122
1123 rbd_destroy_ops(ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001124done:
1125 kfree(seg_name);
1126 return ret;
1127}
1128
1129/*
1130 * Request async osd write
 *
 * Writes always go to the head (CEPH_NOSNAP) under @snapc; completion
 * is reported through the (coll, coll_index) slot by rbd_req_cb().
1131 */
1132static int rbd_req_write(struct request *rq,
1133 struct rbd_device *rbd_dev,
1134 struct ceph_snap_context *snapc,
1135 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001136 struct bio *bio,
1137 struct rbd_req_coll *coll,
1138 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001139{
1140 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1141 CEPH_OSD_OP_WRITE,
1142 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001143 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001144}
1145
1146/*
1147 * Request async osd read
 *
 * Reads from snapshot @snapid (no snap context); completion is
 * reported through the (coll, coll_index) slot by rbd_req_cb().
1148 */
1149static int rbd_req_read(struct request *rq,
1150 struct rbd_device *rbd_dev,
1151 u64 snapid,
1152 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001153 struct bio *bio,
1154 struct rbd_req_coll *coll,
1155 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001156{
1157 return rbd_do_op(rq, rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001158 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001159 CEPH_OSD_OP_READ,
1160 CEPH_OSD_FLAG_READ,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001161 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001162}
1163
1164/*
1165 * Request sync osd read
 *
 * NOTE(review): the @snapc parameter is unused -- NULL is passed to
 * rbd_req_sync_op() regardless.  Consider dropping it from callers.
1166 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001167static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001168 struct ceph_snap_context *snapc,
1169 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001170 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001171 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001172 char *buf,
1173 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001174{
Alex Elder0ce1a792012-07-03 16:01:18 -05001175 return rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001176 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001177 CEPH_OSD_OP_READ,
1178 CEPH_OSD_FLAG_READ,
1179 NULL,
Alex Elderd1f57ea2012-06-26 12:57:03 -07001180 object_name, ofs, len, buf, NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001181}
1182
1183/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001184 * Acknowledge a watch notification (sync osd notify-ack)
1185 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001186static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001187 u64 ver,
1188 u64 notify_id,
Alex Elderaded07e2012-07-03 16:01:18 -05001189 const char *object_name)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001190{
1191 struct ceph_osd_req_op *ops;
Sage Weil11f77002011-05-12 16:13:54 -07001192 int ret;
1193
1194 ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001195 if (ret < 0)
1196 return ret;
1197
 /* NOTE(review): .cookie is assigned without cpu_to_le64() here while
  * the watch/unwatch paths convert it -- confirm which the wire
  * format expects. */
Alex Elder0ce1a792012-07-03 16:01:18 -05001198 ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001199 ops[0].watch.cookie = notify_id;
1200 ops[0].watch.flag = 0;
1201
 /* Fire-and-forget: rbd_simple_req_cb only drops the request ref. */
Alex Elder0ce1a792012-07-03 16:01:18 -05001202 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
Alex Elderaded07e2012-07-03 16:01:18 -05001203 object_name, 0, 0, NULL,
Alex Elderad4f2322012-07-03 16:01:19 -05001204 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001205 CEPH_OSD_FLAG_READ,
1206 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001207 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001208 rbd_simple_req_cb, 0, NULL);
1209
1210 rbd_destroy_ops(ops);
1211 return ret;
1212}
1213
/*
 * Watch-event callback: the header object changed, so refresh our
 * cached header and acknowledge the notification.  The ack is sent
 * even when the refresh failed, so the OSD does not keep resending.
 */
1214static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1215{
Alex Elder0ce1a792012-07-03 16:01:18 -05001216 struct rbd_device *rbd_dev = (struct rbd_device *)data;
Sage Weil13143d22011-05-12 16:08:30 -07001217 int rc;
1218
Alex Elder0ce1a792012-07-03 16:01:18 -05001219 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001220 return;
1221
Alex Elder0bed54d2012-07-03 16:01:18 -05001222 dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n",
1223 rbd_dev->header_name, notify_id, (int) opcode);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001224 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder0ce1a792012-07-03 16:01:18 -05001225 rc = __rbd_refresh_header(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001226 mutex_unlock(&ctl_mutex);
Sage Weil13143d22011-05-12 16:08:30 -07001227 if (rc)
Alex Elderf0f8cef2012-01-29 13:57:44 -06001228 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
Alex Elder0ce1a792012-07-03 16:01:18 -05001229 " update snaps: %d\n", rbd_dev->major, rc);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001230
Alex Elder0bed54d2012-07-03 16:01:18 -05001231 rbd_req_sync_notify_ack(rbd_dev, ver, notify_id, rbd_dev->header_name);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001232}
1233
1234/*
1235 * Request sync osd watch
 *
 * Registers a watch on @object_name so header changes trigger
 * rbd_watch_cb().  On success the lingering request is kept in
 * rbd_dev->watch_request and the event in rbd_dev->watch_event.
1236 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001237static int rbd_req_sync_watch(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001238 const char *object_name,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001239 u64 ver)
1240{
1241 struct ceph_osd_req_op *ops;
Alex Elder0ce1a792012-07-03 16:01:18 -05001242 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001243
1244 int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
1245 if (ret < 0)
1246 return ret;
1247
1248 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
Alex Elder0ce1a792012-07-03 16:01:18 -05001249 (void *)rbd_dev, &rbd_dev->watch_event);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001250 if (ret < 0)
1251 goto fail;
1252
 /* flag = 1: establish the watch (cf. flag = 0 in unwatch). */
1253 ops[0].watch.ver = cpu_to_le64(ver);
Alex Elder0ce1a792012-07-03 16:01:18 -05001254 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001255 ops[0].watch.flag = 1;
1256
Alex Elder0ce1a792012-07-03 16:01:18 -05001257 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001258 CEPH_NOSNAP,
1259 0,
1260 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1261 ops,
Alex Elderd1f57ea2012-06-26 12:57:03 -07001262 object_name, 0, 0, NULL,
Alex Elder0ce1a792012-07-03 16:01:18 -05001263 &rbd_dev->watch_request, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001264
1265 if (ret < 0)
1266 goto fail_event;
1267
1268 rbd_destroy_ops(ops);
1269 return 0;
1270
 /* Unwind in reverse order of acquisition. */
1271fail_event:
Alex Elder0ce1a792012-07-03 16:01:18 -05001272 ceph_osdc_cancel_event(rbd_dev->watch_event);
1273 rbd_dev->watch_event = NULL;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001274fail:
1275 rbd_destroy_ops(ops);
1276 return ret;
1277}
1278
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001279/*
1280 * Request sync osd unwatch
 *
 * Tears down the watch registered by rbd_req_sync_watch(); the event
 * is cancelled unconditionally, even if the unwatch op itself failed.
1281 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001282static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001283 const char *object_name)
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001284{
1285 struct ceph_osd_req_op *ops;
1286
1287 int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
1288 if (ret < 0)
1289 return ret;
1290
 /* flag = 0: remove the existing watch identified by the cookie. */
1291 ops[0].watch.ver = 0;
Alex Elder0ce1a792012-07-03 16:01:18 -05001292 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001293 ops[0].watch.flag = 0;
1294
Alex Elder0ce1a792012-07-03 16:01:18 -05001295 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001296 CEPH_NOSNAP,
1297 0,
1298 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1299 ops,
Alex Elderd1f57ea2012-06-26 12:57:03 -07001300 object_name, 0, 0, NULL, NULL, NULL);
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001301
1302 rbd_destroy_ops(ops);
Alex Elder0ce1a792012-07-03 16:01:18 -05001303 ceph_osdc_cancel_event(rbd_dev->watch_event);
1304 rbd_dev->watch_event = NULL;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001305 return ret;
1306}
1307
/* Context passed (by address) to ceph_osdc_create_event() for notify. */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001308struct rbd_notify_info {
Alex Elder0ce1a792012-07-03 16:01:18 -05001309 struct rbd_device *rbd_dev;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001310};
1311
/*
 * Completion callback for the self-notify in rbd_req_sync_notify().
 *
 * NOTE(review): the caller registers this with data = &info where
 * info is a struct rbd_notify_info, yet the cast below treats data as
 * a struct rbd_device * directly.  That looks like a type mismatch --
 * rbd_dev here would actually point at the info struct.  Verify and
 * align the caller and this cast.
 */
1312static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1313{
Alex Elder0ce1a792012-07-03 16:01:18 -05001314 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1315 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001316 return;
1317
Alex Elder0ce1a792012-07-03 16:01:18 -05001318 dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n",
Alex Elder0bed54d2012-07-03 16:01:18 -05001319 rbd_dev->header_name,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001320 notify_id, (int)opcode);
1321}
1322
1323/*
1324 * Request sync osd notify
 *
 * Sends a notify on @object_name and waits (bounded by
 * CEPH_OSD_TIMEOUT_DEFAULT) for all watchers to acknowledge.
1325 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001326static int rbd_req_sync_notify(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001327 const char *object_name)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001328{
1329 struct ceph_osd_req_op *ops;
Alex Elder0ce1a792012-07-03 16:01:18 -05001330 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001331 struct ceph_osd_event *event;
1332 struct rbd_notify_info info;
 /* notify payload: two le32 fields (version and timeout). */
1333 int payload_len = sizeof(u32) + sizeof(u32);
1334 int ret;
1335
1336 ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
1337 if (ret < 0)
1338 return ret;
1339
Alex Elder0ce1a792012-07-03 16:01:18 -05001340 info.rbd_dev = rbd_dev;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001341
1342 ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
1343 (void *)&info, &event);
1344 if (ret < 0)
1345 goto fail;
1346
 /* timeout is 12 -- presumably seconds; confirm against the
  * OSD notify protocol (RADOS_NOTIFY_VER). */
1347 ops[0].watch.ver = 1;
1348 ops[0].watch.flag = 1;
1349 ops[0].watch.cookie = event->cookie;
1350 ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
1351 ops[0].watch.timeout = 12;
1352
Alex Elder0ce1a792012-07-03 16:01:18 -05001353 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001354 CEPH_NOSNAP,
1355 0,
1356 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1357 ops,
Alex Elderd1f57ea2012-06-26 12:57:03 -07001358 object_name, 0, 0, NULL, NULL, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001359 if (ret < 0)
1360 goto fail_event;
1361
1362 ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
1363 dout("ceph_osdc_wait_event returned %d\n", ret);
1364 rbd_destroy_ops(ops);
1365 return 0;
1366
1367fail_event:
1368 ceph_osdc_cancel_event(event);
1369fail:
1370 rbd_destroy_ops(ops);
1371 return ret;
1372}
1373
1374/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001375 * Execute a synchronous OSD class-method call (cls exec)
 *
 * Invokes @class_name.@method_name on @object_name with @data as the
 * input payload; @ver (optional) receives the object version.
1376 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001377static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001378 const char *object_name,
1379 const char *class_name,
1380 const char *method_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001381 const char *data,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001382 int len,
1383 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001384{
1385 struct ceph_osd_req_op *ops;
Alex Elderaded07e2012-07-03 16:01:18 -05001386 int class_name_len = strlen(class_name);
1387 int method_name_len = strlen(method_name);
 /* payload carries class name + method name + input data */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001388 int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
Alex Elderaded07e2012-07-03 16:01:18 -05001389 class_name_len + method_name_len + len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001390 if (ret < 0)
1391 return ret;
1392
Alex Elderaded07e2012-07-03 16:01:18 -05001393 ops[0].cls.class_name = class_name;
1394 ops[0].cls.class_len = (__u8) class_name_len;
1395 ops[0].cls.method_name = method_name;
1396 ops[0].cls.method_len = (__u8) method_name_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001397 ops[0].cls.argc = 0;
1398 ops[0].cls.indata = data;
1399 ops[0].cls.indata_len = len;
1400
Alex Elder0ce1a792012-07-03 16:01:18 -05001401 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001402 CEPH_NOSNAP,
1403 0,
1404 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1405 ops,
Alex Elderd1f57ea2012-06-26 12:57:03 -07001406 object_name, 0, 0, NULL, NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001407
1408 rbd_destroy_ops(ops);
1409
1410 dout("cls_exec returned %d\n", ret);
1411 return ret;
1412}
1413
/*
 * Allocate a completion collection with room for @num_reqs status
 * slots.  GFP_ATOMIC because this runs from the request function
 * with the queue lock held.  Returns NULL on allocation failure.
 */
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001414static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1415{
1416 struct rbd_req_coll *coll =
1417 kzalloc(sizeof(struct rbd_req_coll) +
1418 sizeof(struct rbd_req_status) * num_reqs,
1419 GFP_ATOMIC);
1420
1421 if (!coll)
1422 return NULL;
1423 coll->total = num_reqs;
1424 kref_init(&coll->kref);
1425 return coll;
1426}
1427
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001428/*
1429 * block device queue callback
 *
 * Entered with q->queue_lock held; the lock is dropped while requests
 * are split per object segment and submitted, then retaken before
 * fetching the next request.
1430 */
1431static void rbd_rq_fn(struct request_queue *q)
1432{
1433 struct rbd_device *rbd_dev = q->queuedata;
1434 struct request *rq;
1435 struct bio_pair *bp = NULL;
1436
Alex Elder00f1f362012-02-07 12:03:36 -06001437 while ((rq = blk_fetch_request(q))) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001438 struct bio *bio;
1439 struct bio *rq_bio, *next_bio = NULL;
1440 bool do_write;
1441 int size, op_size = 0;
1442 u64 ofs;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001443 int num_segs, cur_seg = 0;
1444 struct rbd_req_coll *coll;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001445
 /* NOTE(review): dead check -- the while condition above
  * already guarantees rq is non-NULL here. */
1446 /* peek at request from block layer */
1447 if (!rq)
1448 break;
1449
1450 dout("fetched request\n");
1451
1452 /* filter out block requests we don't understand */
1453 if ((rq->cmd_type != REQ_TYPE_FS)) {
1454 __blk_end_request_all(rq, 0);
Alex Elder00f1f362012-02-07 12:03:36 -06001455 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001456 }
1457
1458 /* deduce our operation (read, write) */
1459 do_write = (rq_data_dir(rq) == WRITE);
1460
1461 size = blk_rq_bytes(rq);
Alex Elder593a9e72012-02-07 12:03:37 -06001462 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001463 rq_bio = rq->bio;
1464 if (do_write && rbd_dev->read_only) {
1465 __blk_end_request_all(rq, -EROFS);
Alex Elder00f1f362012-02-07 12:03:36 -06001466 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001467 }
1468
 /* drop the queue lock for the (sleeping) submission work */
1469 spin_unlock_irq(q->queue_lock);
1470
1471 dout("%s 0x%x bytes at 0x%llx\n",
1472 do_write ? "write" : "read",
Alex Elder593a9e72012-02-07 12:03:37 -06001473 size, blk_rq_pos(rq) * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001474
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001475 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
1476 coll = rbd_alloc_coll(num_segs);
1477 if (!coll) {
1478 spin_lock_irq(q->queue_lock);
1479 __blk_end_request_all(rq, -ENOMEM);
Alex Elder00f1f362012-02-07 12:03:36 -06001480 continue;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001481 }
1482
 /* one async request per object segment; each holds a coll ref */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001483 do {
1484 /* a bio clone to be passed down to OSD req */
1485 dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
1486 op_size = rbd_get_segment(&rbd_dev->header,
Alex Elderca1e49a2012-07-10 20:30:09 -05001487 rbd_dev->header.object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001488 ofs, size,
1489 NULL, NULL);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001490 kref_get(&coll->kref);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001491 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1492 op_size, GFP_ATOMIC);
1493 if (!bio) {
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001494 rbd_coll_end_req_index(rq, coll, cur_seg,
1495 -ENOMEM, op_size);
1496 goto next_seg;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001497 }
1498
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001499
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001500 /* init OSD command: write or read */
1501 if (do_write)
1502 rbd_req_write(rq, rbd_dev,
1503 rbd_dev->header.snapc,
1504 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001505 op_size, bio,
1506 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001507 else
1508 rbd_req_read(rq, rbd_dev,
Josh Durgin77dfe992011-11-21 13:04:42 -08001509 rbd_dev->snap_id,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001510 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001511 op_size, bio,
1512 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001513
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001514next_seg:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001515 size -= op_size;
1516 ofs += op_size;
1517
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001518 cur_seg++;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001519 rq_bio = next_bio;
1520 } while (size > 0);
 /* drop the allocation's initial reference */
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001521 kref_put(&coll->kref, rbd_coll_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001522
1523 if (bp)
1524 bio_pair_release(bp);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001525 spin_lock_irq(q->queue_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001526 }
1527}
1528
1529/*
1530 * a queue callback. Makes sure that we don't create a bio that spans across
1531 * multiple osd objects. One exception would be with a single page bios,
1532 * which we handle later at bio_chain_clone
1533 */
1534static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1535 struct bio_vec *bvec)
1536{
1537 struct rbd_device *rbd_dev = q->queuedata;
Alex Elder593a9e72012-02-07 12:03:37 -06001538 unsigned int chunk_sectors;
1539 sector_t sector;
1540 unsigned int bio_sectors;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001541 int max;
1542
 /* object ("chunk") size in sectors, from the image's object order */
Alex Elder593a9e72012-02-07 12:03:37 -06001543 chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1544 sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1545 bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1546
 /* bytes remaining in the current object after this bio */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001547 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
Alex Elder593a9e72012-02-07 12:03:37 -06001548 + bio_sectors)) << SECTOR_SHIFT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001549 if (max < 0)
1550 max = 0; /* bio_add cannot handle a negative return */
1551 if (max <= bvec->bv_len && bio_sectors == 0)
1552 return bvec->bv_len;
1553 return max;
1554}
1555
/*
 * Tear down the gendisk (and its queue) for an rbd device; also frees
 * the cached image header.  Safe to call when no disk was allocated.
 */
1556static void rbd_free_disk(struct rbd_device *rbd_dev)
1557{
1558 struct gendisk *disk = rbd_dev->disk;
1559
1560 if (!disk)
1561 return;
1562
1563 rbd_header_free(&rbd_dev->header);
1564
1565 if (disk->flags & GENHD_FL_UP)
1566 del_gendisk(disk);
1567 if (disk->queue)
1568 blk_cleanup_queue(disk->queue);
1569 put_disk(disk);
1570}
1571
1572/*
1573 * re-read the on-disk image header into @header
 *
 * The header size depends on the snapshot count, which can change
 * between reads, so loop: read, parse, and retry with the larger size
 * whenever the snapshot count observed differs from the one we sized
 * the buffer for.
1574 */
1575static int rbd_read_header(struct rbd_device *rbd_dev,
1576 struct rbd_image_header *header)
1577{
1578 ssize_t rc;
1579 struct rbd_image_header_ondisk *dh;
Xi Wang50f7c4c2012-04-20 15:49:44 -05001580 u32 snap_count = 0;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001581 u64 ver;
Alex Elder00f1f362012-02-07 12:03:36 -06001582 size_t len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001583
Alex Elder00f1f362012-02-07 12:03:36 -06001584 /*
1585 * First reads the fixed-size header to determine the number
1586 * of snapshots, then re-reads it, along with all snapshot
1587 * records as well as their stored names.
1588 */
1589 len = sizeof (*dh);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001590 while (1) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001591 dh = kmalloc(len, GFP_KERNEL);
1592 if (!dh)
1593 return -ENOMEM;
1594
1595 rc = rbd_req_sync_read(rbd_dev,
1596 NULL, CEPH_NOSNAP,
Alex Elder0bed54d2012-07-03 16:01:18 -05001597 rbd_dev->header_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001598 0, len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001599 (char *)dh, &ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001600 if (rc < 0)
1601 goto out_dh;
1602
1603 rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
Josh Durgin81e759f2011-11-15 14:49:53 -08001604 if (rc < 0) {
Alex Elder00f1f362012-02-07 12:03:36 -06001605 if (rc == -ENXIO)
Josh Durgin81e759f2011-11-15 14:49:53 -08001606 pr_warning("unrecognized header format"
Alex Elder0bed54d2012-07-03 16:01:18 -05001607 " for image %s\n",
1608 rbd_dev->image_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001609 goto out_dh;
Josh Durgin81e759f2011-11-15 14:49:53 -08001610 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001611
Alex Elder00f1f362012-02-07 12:03:36 -06001612 if (snap_count == header->total_snaps)
1613 break;
1614
 /* snapshot count changed under us: resize and retry */
1615 snap_count = header->total_snaps;
1616 len = sizeof (*dh) +
1617 snap_count * sizeof(struct rbd_image_snap_ondisk) +
1618 header->snap_names_len;
1619
1620 rbd_header_free(header);
1621 kfree(dh);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001622 }
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001623 header->obj_version = ver;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001624
1625out_dh:
1626 kfree(dh);
1627 return rc;
1628}
1629
1630/*
1631 * create a snapshot
 *
 * Allocates a new snap id from the monitor, then invokes the "rbd"
 * class method "snap_add" on the header object with the encoded
 * (name, id) payload.  Only valid when mapped at the head.
1632 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001633static int rbd_header_add_snap(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001634 const char *snap_name,
1635 gfp_t gfp_flags)
1636{
1637 int name_len = strlen(snap_name);
1638 u64 new_snapid;
1639 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001640 void *data, *p, *e;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001641 u64 ver;
Alex Elder1dbb4392012-01-24 10:08:37 -06001642 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001643
1644 /* we should create a snapshot only if we're pointing at the head */
Alex Elder0ce1a792012-07-03 16:01:18 -05001645 if (rbd_dev->snap_id != CEPH_NOSNAP)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001646 return -EINVAL;
1647
Alex Elder0ce1a792012-07-03 16:01:18 -05001648 monc = &rbd_dev->rbd_client->client->monc;
1649 ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001650 dout("created snapid=%lld\n", new_snapid);
1651 if (ret < 0)
1652 return ret;
1653
 /* name_len + 16 covers the encoded string (4 + name_len) plus
  * the encoded u64 snap id (8), so the "bad" jumps below should be
  * unreachable in practice. */
1654 data = kmalloc(name_len + 16, gfp_flags);
1655 if (!data)
1656 return -ENOMEM;
1657
Sage Weil916d4d62011-05-12 16:10:50 -07001658 p = data;
1659 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001660
Sage Weil916d4d62011-05-12 16:10:50 -07001661 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1662 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001663
Alex Elder0bed54d2012-07-03 16:01:18 -05001664 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
Alex Elder0ce1a792012-07-03 16:01:18 -05001665 "rbd", "snap_add",
Sage Weil916d4d62011-05-12 16:10:50 -07001666 data, p - data, &ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001667
Sage Weil916d4d62011-05-12 16:10:50 -07001668 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001669
1670 if (ret < 0)
1671 return ret;
1672
Alex Elder0ce1a792012-07-03 16:01:18 -05001673 down_write(&rbd_dev->header_rwsem);
1674 rbd_dev->header.snapc->seq = new_snapid;
1675 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001676
1677 return 0;
 /* NOTE(review): if this path were ever taken, `data` would leak --
  * it is allocated before the encode calls but not freed here. */
1678bad:
1679 return -ERANGE;
1680}
1681
/*
 * Remove every snapshot device hanging off @rbd_dev->snaps.
 * Caller is expected to hold whatever lock protects the snaps list
 * (ctl_mutex at the call sites) -- confirm.
 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001682static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1683{
1684 struct rbd_snap *snap;
1685
1686 while (!list_empty(&rbd_dev->snaps)) {
1687 snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node);
1688 __rbd_remove_snap_dev(rbd_dev, snap);
1689 }
1690}
1691
/*
 * only read the first part of the ondisk header, without the snaps info
 *
 * Re-reads the on-disk header via rbd_read_header(), then swaps the
 * freshly read snapshot bookkeeping into rbd_dev->header under
 * header_rwsem and reconciles the snapshot device list with
 * __rbd_init_snaps_header().  Returns 0 or a negative errno.
 */
static int __rbd_refresh_header(struct rbd_device *rbd_dev)
{
	int ret;
	struct rbd_image_header h;
	u64 snap_seq;
	int follow_seq = 0;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	/* resized? */
	/* NOTE(review): capacity is updated before header_rwsem is taken —
	   presumably safe because the disk size is only advisory here; confirm. */
	set_capacity(rbd_dev->disk, h.image_size / SECTOR_SIZE);

	down_write(&rbd_dev->header_rwsem);

	/* Remember the current seq so we can restore it unless it tracked
	 * the newest snapshot (the "head"). */
	snap_seq = rbd_dev->header.snapc->seq;
	if (rbd_dev->header.total_snaps &&
	    rbd_dev->header.snapc->snaps[0] == snap_seq)
		/* pointing at the head, will need to follow that
		   if head moves */
		follow_seq = 1;

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	kfree(rbd_dev->header.snapc);

	/* Take ownership of the newly read snapshot data. */
	rbd_dev->header.total_snaps = h.total_snaps;
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_names_len = h.snap_names_len;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	if (follow_seq)
		rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0];
	else
		rbd_dev->header.snapc->seq = snap_seq;

	ret = __rbd_init_snaps_header(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
1743
/*
 * Set up the gendisk and request queue for a newly mapped rbd device:
 * read the image header, populate the snapshot list, select the mapped
 * snapshot, then allocate/configure the disk and announce it.
 * Returns 0 on success or a negative errno; on failure the partially
 * allocated disk is released.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	int rc;
	u64 segment_size;
	u64 total_size = 0;

	/* contact OSD, request size info about the object being mapped */
	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (rc)
		return rc;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = __rbd_init_snaps_header(rbd_dev);
	if (rc)
		return rc;

	/* pick the mapped snapshot; total_size becomes its size */
	rc = rbd_header_set_snap(rbd_dev, &total_size);
	if (rc)
		return rc;

	/* create gendisk info */
	rc = -ENOMEM;
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		goto out;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	rc = -ENOMEM;
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;
	rbd_dev->q = q;

	/* finally, announce the disk to the world */
	set_capacity(disk, total_size / SECTOR_SIZE);
	add_disk(disk);

	pr_info("%s: added with size 0x%llx\n",
		disk->disk_name, (unsigned long long)total_size);
	return 0;

out_disk:
	put_disk(disk);
out:
	return rc;
}
1816
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001817/*
1818 sysfs
1819*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001820
Alex Elder593a9e72012-02-07 12:03:37 -06001821static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1822{
1823 return container_of(dev, struct rbd_device, dev);
1824}
1825
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001826static ssize_t rbd_size_show(struct device *dev,
1827 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001828{
Alex Elder593a9e72012-02-07 12:03:37 -06001829 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001830
1831 return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001832}
1833
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001834static ssize_t rbd_major_show(struct device *dev,
1835 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001836{
Alex Elder593a9e72012-02-07 12:03:37 -06001837 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001838
1839 return sprintf(buf, "%d\n", rbd_dev->major);
1840}
1841
1842static ssize_t rbd_client_id_show(struct device *dev,
1843 struct device_attribute *attr, char *buf)
1844{
Alex Elder593a9e72012-02-07 12:03:37 -06001845 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001846
Alex Elder1dbb4392012-01-24 10:08:37 -06001847 return sprintf(buf, "client%lld\n",
1848 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001849}
1850
1851static ssize_t rbd_pool_show(struct device *dev,
1852 struct device_attribute *attr, char *buf)
1853{
Alex Elder593a9e72012-02-07 12:03:37 -06001854 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001855
1856 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1857}
1858
Alex Elder9bb2f332012-07-12 10:46:35 -05001859static ssize_t rbd_pool_id_show(struct device *dev,
1860 struct device_attribute *attr, char *buf)
1861{
1862 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1863
1864 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1865}
1866
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001867static ssize_t rbd_name_show(struct device *dev,
1868 struct device_attribute *attr, char *buf)
1869{
Alex Elder593a9e72012-02-07 12:03:37 -06001870 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001871
Alex Elder0bed54d2012-07-03 16:01:18 -05001872 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001873}
1874
1875static ssize_t rbd_snap_show(struct device *dev,
1876 struct device_attribute *attr,
1877 char *buf)
1878{
Alex Elder593a9e72012-02-07 12:03:37 -06001879 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001880
1881 return sprintf(buf, "%s\n", rbd_dev->snap_name);
1882}
1883
1884static ssize_t rbd_image_refresh(struct device *dev,
1885 struct device_attribute *attr,
1886 const char *buf,
1887 size_t size)
1888{
Alex Elder593a9e72012-02-07 12:03:37 -06001889 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001890 int rc;
1891 int ret = size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001892
1893 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1894
Josh Durgin263c6ca2011-12-05 10:43:42 -08001895 rc = __rbd_refresh_header(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001896 if (rc < 0)
1897 ret = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001898
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001899 mutex_unlock(&ctl_mutex);
1900 return ret;
1901}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001902
/*
 * Per-device sysfs attributes.  The read-only entries report mapping
 * state; "refresh" and "create_snap" are write-only action triggers.
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001912
/* Attribute table registered via rbd_device_type below. */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	&dev_attr_create_snap.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/*
 * Intentionally empty: rbd_device teardown is handled elsewhere (the
 * device's own release callback is set in rbd_bus_add_dev); this stub
 * only satisfies the driver core's requirement for a release hook.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
1944
1945
1946/*
1947 sysfs - snapshots
1948*/
1949
1950static ssize_t rbd_snap_size_show(struct device *dev,
1951 struct device_attribute *attr,
1952 char *buf)
1953{
1954 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1955
Josh Durgin35915382011-12-05 18:25:13 -08001956 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001957}
1958
1959static ssize_t rbd_snap_id_show(struct device *dev,
1960 struct device_attribute *attr,
1961 char *buf)
1962{
1963 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1964
Josh Durgin35915382011-12-05 18:25:13 -08001965 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001966}
1967
/* Per-snapshot sysfs attributes (read-only). */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/*
 * Release callback for a snapshot device: frees the rbd_snap (and its
 * name) once the device's last reference is dropped.
 */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
1997
/*
 * Unlink a snapshot from the device's snap list and unregister its
 * sysfs device.  The rbd_snap itself is freed by rbd_snap_dev_release()
 * when the device reference count drops to zero.
 */
static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
				  struct rbd_snap *snap)
{
	list_del(&snap->node);
	device_unregister(&snap->dev);
}
2004
2005static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
2006 struct rbd_snap *snap,
2007 struct device *parent)
2008{
2009 struct device *dev = &snap->dev;
2010 int ret;
2011
2012 dev->type = &rbd_snap_device_type;
2013 dev->parent = parent;
2014 dev->release = rbd_snap_dev_release;
2015 dev_set_name(dev, "snap_%s", snap->name);
2016 ret = device_register(dev);
2017
2018 return ret;
2019}
2020
2021static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
2022 int i, const char *name,
2023 struct rbd_snap **snapp)
2024{
2025 int ret;
2026 struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
2027 if (!snap)
2028 return -ENOMEM;
2029 snap->name = kstrdup(name, GFP_KERNEL);
2030 snap->size = rbd_dev->header.snap_sizes[i];
2031 snap->id = rbd_dev->header.snapc->snaps[i];
2032 if (device_is_registered(&rbd_dev->dev)) {
2033 ret = rbd_register_snap_dev(rbd_dev, snap,
2034 &rbd_dev->dev);
2035 if (ret < 0)
2036 goto err;
2037 }
2038 *snapp = snap;
2039 return 0;
2040err:
2041 kfree(snap->name);
2042 kfree(snap);
2043 return ret;
2044}
2045
2046/*
2047 * search for the previous snap in a null delimited string list
2048 */
2049const char *rbd_prev_snap_name(const char *name, const char *start)
2050{
2051 if (name < start + 2)
2052 return NULL;
2053
2054 name -= 2;
2055 while (*name) {
2056 if (name == start)
2057 return start;
2058 name--;
2059 }
2060 return name + 1;
2061}
2062
/*
 * compare the old list of snapshots that we have to what's in the header
 * and update it accordingly. Note that the header holds the snapshots
 * in a reverse order (from newest to oldest) and we need to go from
 * older to new so that we don't get a duplicate snap name when
 * doing the process (e.g., removed snapshot and recreated a new
 * one with the same name.
 *
 * Walks the existing snap list (oldest first, via list_for_each_prev)
 * in lockstep with the header's snap array (index i counts down) and
 * the '\0'-delimited name blob (cursor "name" moves backwards through
 * it with rbd_prev_snap_name()).  Returns 0 or a negative errno.
 */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
{
	const char *name, *first_name;
	int i = rbd_dev->header.total_snaps;
	struct rbd_snap *snap, *old_snap = NULL;
	int ret;
	struct list_head *p, *n;

	first_name = rbd_dev->header.snap_names;
	/* start one past the end of the name blob; stepped back as we go */
	name = first_name + rbd_dev->header.snap_names_len;

	list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
		u64 cur_id;

		old_snap = list_entry(p, struct rbd_snap, node);

		/* cur_id is only read below when i != 0 (short-circuit) */
		if (i)
			cur_id = rbd_dev->header.snapc->snaps[i - 1];

		if (!i || old_snap->id < cur_id) {
			/* old_snap->id was skipped, thus was removed */
			__rbd_remove_snap_dev(rbd_dev, old_snap);
			continue;
		}
		if (old_snap->id == cur_id) {
			/* we have this snapshot already */
			i--;
			name = rbd_prev_snap_name(name, first_name);
			continue;
		}
		for (; i > 0;
		     i--, name = rbd_prev_snap_name(name, first_name)) {
			if (!name) {
				WARN_ON(1);
				return -EINVAL;
			}
			/* NOTE(review): snaps[i] with i possibly equal to
			 * total_snaps looks one past the last valid index;
			 * snaps[i - 1] would match the usage above — verify. */
			cur_id = rbd_dev->header.snapc->snaps[i];
			/* snapshot removal? handle it above */
			if (cur_id >= old_snap->id)
				break;
			/* a new snapshot */
			ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
			if (ret < 0)
				return ret;

			/* note that we add it backward so using n and not p */
			list_add(&snap->node, n);
			p = &snap->node;
		}
	}
	/* we're done going over the old snap list, just add what's left */
	for (; i > 0; i--) {
		name = rbd_prev_snap_name(name, first_name);
		if (!name) {
			WARN_ON(1);
			return -EINVAL;
		}
		ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
		if (ret < 0)
			return ret;
		list_add(&snap->node, &rbd_dev->snaps);
	}

	return 0;
}
2136
/*
 * Register the rbd device on the rbd bus (under rbd_root_dev) and then
 * register a sysfs device for each snapshot already on its snap list.
 * All of this happens under ctl_mutex.  Returns 0 or a negative errno;
 * note a failed snapshot registration leaves the main device registered.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	int ret;
	struct device *dev;
	struct rbd_snap *snap;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	dev = &rbd_dev->dev;

	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->id);
	ret = device_register(dev);
	if (ret < 0)
		goto out;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		ret = rbd_register_snap_dev(rbd_dev, snap,
					     &rbd_dev->dev);
		if (ret < 0)
			break;
	}
out:
	mutex_unlock(&ctl_mutex);
	return ret;
}
2165
/*
 * Unregister the rbd device from sysfs; cleanup then proceeds through
 * the device's release callback (rbd_dev_release).
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2170
/*
 * Set up a watch on the image's header object.  If the watch request
 * fails with -ERANGE (our cached header version is stale), refresh the
 * header under ctl_mutex and retry until it succeeds or fails with a
 * different error.  Returns 0 or a negative errno.
 */
static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
{
	int ret, rc;

	do {
		ret = rbd_req_sync_watch(rbd_dev, rbd_dev->header_name,
					 rbd_dev->header.obj_version);
		if (ret == -ERANGE) {
			mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
			rc = __rbd_refresh_header(rbd_dev);
			mutex_unlock(&ctl_mutex);
			if (rc < 0)
				return rc;
		}
	} while (ret == -ERANGE);

	return ret;
}
2189
/* Highest rbd device id handed out so far; ids start at 1 (0 unused). */
static atomic64_t rbd_id_max = ATOMIC64_INIT(0);
2191
/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list. The minimum rbd id is 1.
 */
static void rbd_id_get(struct rbd_device *rbd_dev)
{
	/* atomic increment guarantees a unique, monotonically rising id */
	rbd_dev->id = atomic64_inc_return(&rbd_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002204
Alex Elder1ddbe942012-01-29 13:57:44 -06002205/*
Alex Elder499afd52012-02-02 08:13:29 -06002206 * Remove an rbd_dev from the global list, and record that its
2207 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002208 */
Alex Elder499afd52012-02-02 08:13:29 -06002209static void rbd_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002210{
Alex Elderd184f6b2012-01-29 13:57:44 -06002211 struct list_head *tmp;
2212 int rbd_id = rbd_dev->id;
2213 int max_id;
2214
2215 BUG_ON(rbd_id < 1);
Alex Elder499afd52012-02-02 08:13:29 -06002216
2217 spin_lock(&rbd_dev_list_lock);
2218 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002219
2220 /*
2221 * If the id being "put" is not the current maximum, there
2222 * is nothing special we need to do.
2223 */
2224 if (rbd_id != atomic64_read(&rbd_id_max)) {
2225 spin_unlock(&rbd_dev_list_lock);
2226 return;
2227 }
2228
2229 /*
2230 * We need to update the current maximum id. Search the
2231 * list to find out what it is. We're more likely to find
2232 * the maximum at the end, so search the list backward.
2233 */
2234 max_id = 0;
2235 list_for_each_prev(tmp, &rbd_dev_list) {
2236 struct rbd_device *rbd_dev;
2237
2238 rbd_dev = list_entry(tmp, struct rbd_device, node);
2239 if (rbd_id > max_id)
2240 max_id = rbd_id;
2241 }
Alex Elder499afd52012-02-02 08:13:29 -06002242 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002243
Alex Elder1ddbe942012-01-29 13:57:44 -06002244 /*
Alex Elderd184f6b2012-01-29 13:57:44 -06002245 * The max id could have been updated by rbd_id_get(), in
2246 * which case it now accurately reflects the new maximum.
2247 * Be careful not to overwrite the maximum value in that
2248 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002249 */
Alex Elderd184f6b2012-01-29 13:57:44 -06002250 atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
Alex Elderb7f23c32012-01-29 13:57:43 -06002251}
2252
/*
 * Advance *buf past any leading white space and return the length of
 * the token (run of non-space characters) that follows.  *buf is left
 * pointing at the first non-space character, or at the terminating
 * '\0' when nothing remains.  *buf must be '\0'-terminated.
 */
static inline size_t next_token(const char **buf)
{
	/* the characters isspace() matches in the "C"/"POSIX" locales */
	static const char spaces[] = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* skip leading white space */

	return strcspn(*buf, spaces);	/* measure the token */
}
2271
/*
 * Find the next token in *buf and, when the supplied buffer is big
 * enough, copy it there with '\0' termination.  *buf is advanced past
 * the token even when it does not fit (return value >= token_size);
 * a return of 0 means no token was found.  *buf must be
 * '\0'-terminated on entry.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2301
2302/*
Alex Elderea3352f2012-07-09 21:04:23 -05002303 * Finds the next token in *buf, dynamically allocates a buffer big
2304 * enough to hold a copy of it, and copies the token into the new
2305 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2306 * that a duplicate buffer is created even for a zero-length token.
2307 *
2308 * Returns a pointer to the newly-allocated duplicate, or a null
2309 * pointer if memory for the duplicate was not available. If
2310 * the lenp argument is a non-null pointer, the length of the token
2311 * (not including the '\0') is returned in *lenp.
2312 *
2313 * If successful, the *buf pointer will be updated to point beyond
2314 * the end of the found token.
2315 *
2316 * Note: uses GFP_KERNEL for allocation.
2317 */
2318static inline char *dup_token(const char **buf, size_t *lenp)
2319{
2320 char *dup;
2321 size_t len;
2322
2323 len = next_token(buf);
2324 dup = kmalloc(len + 1, GFP_KERNEL);
2325 if (!dup)
2326 return NULL;
2327
2328 memcpy(dup, *buf, len);
2329 *(dup + len) = '\0';
2330 *buf += len;
2331
2332 if (lenp)
2333 *lenp = len;
2334
2335 return dup;
2336}
2337
/*
 * This fills in the pool_name, image_name, image_name_len, snap_name,
 * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
 * on the list of monitor addresses and other options provided via
 * /sys/bus/rbd/add.  On return *mon_addrs points into "buf" (not a
 * copy) and *mon_addrs_size is its length plus one; "options" receives
 * the second token.
 *
 * Returns 0 on success, -EINVAL for a malformed command, or -ENOMEM.
 * On error all strings allocated here are freed and pool_name is
 * reset to NULL (rbd_add() keys its own cleanup off pool_name).
 *
 * Note: rbd_dev is assumed to have been initially zero-filled.
 */
static int rbd_add_parse_args(struct rbd_device *rbd_dev,
			      const char *buf,
			      const char **mon_addrs,
			      size_t *mon_addrs_size,
			      char *options,
			      size_t options_size)
{
	size_t len;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len)
		return -EINVAL;
	*mon_addrs_size = len + 1;
	*mon_addrs = buf;

	buf += len;

	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return -EINVAL;

	ret = -ENOMEM;
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Create the name of the header object */

	rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
						+ sizeof (RBD_SUFFIX),
					GFP_KERNEL);
	if (!rbd_dev->header_name)
		goto out_err;
	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);

	/*
	 * The snapshot name is optional.  If none is supplied,
	 * we use the default value.
	 */
	rbd_dev->snap_name = dup_token(&buf, &len);
	if (!rbd_dev->snap_name)
		goto out_err;
	if (!len) {
		/* Replace the empty name with the default */
		kfree(rbd_dev->snap_name);
		rbd_dev->snap_name
			= kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
		if (!rbd_dev->snap_name)
			goto out_err;

		memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
			sizeof (RBD_SNAP_HEAD_NAME));
	}

	return 0;

out_err:
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->image_name);
	kfree(rbd_dev->pool_name);
	/* NULL pool_name signals the caller that nothing here survives */
	rbd_dev->pool_name = NULL;

	return ret;
}
2417
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002418static ssize_t rbd_add(struct bus_type *bus,
2419 const char *buf,
2420 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002421{
Alex Eldercb8627c2012-07-09 21:04:23 -05002422 char *options;
2423 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06002424 const char *mon_addrs = NULL;
2425 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06002426 struct ceph_osd_client *osdc;
2427 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002428
2429 if (!try_module_get(THIS_MODULE))
2430 return -ENODEV;
2431
Alex Elder27cc2592012-02-02 08:13:30 -06002432 options = kmalloc(count, GFP_KERNEL);
2433 if (!options)
2434 goto err_nomem;
Alex Eldercb8627c2012-07-09 21:04:23 -05002435 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2436 if (!rbd_dev)
2437 goto err_nomem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002438
2439 /* static rbd_device initialization */
2440 spin_lock_init(&rbd_dev->lock);
2441 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002442 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08002443 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002444
Josh Durginc6666012011-11-21 17:11:12 -08002445 init_rwsem(&rbd_dev->header_rwsem);
Alex Elder0e805a12012-01-11 19:42:15 -08002446
Alex Elderd184f6b2012-01-29 13:57:44 -06002447 /* generate unique id: find highest unique id, add one */
Alex Elder499afd52012-02-02 08:13:29 -06002448 rbd_id_get(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002449
Alex Eldera725f65e2012-02-02 08:13:30 -06002450 /* Fill in the device name, now that we have its id. */
Alex Elder81a89792012-02-02 08:13:30 -06002451 BUILD_BUG_ON(DEV_NAME_LEN
2452 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
2453 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->id);
Alex Eldere124a822012-01-29 13:57:44 -06002454
Alex Eldera725f65e2012-02-02 08:13:30 -06002455 /* parse add command */
Alex Elder7ef32142012-02-02 08:13:30 -06002456 rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
Alex Eldere28fff262012-02-02 08:13:30 -06002457 options, count);
Alex Eldera725f65e2012-02-02 08:13:30 -06002458 if (rc)
2459 goto err_put_id;
2460
Alex Elder5214ecc2012-02-02 08:13:30 -06002461 rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1,
2462 options);
Alex Elderd720bcb2012-02-02 08:13:30 -06002463 if (IS_ERR(rbd_dev->rbd_client)) {
2464 rc = PTR_ERR(rbd_dev->rbd_client);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002465 goto err_put_id;
Alex Elderd720bcb2012-02-02 08:13:30 -06002466 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002467
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002468 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06002469 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002470 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2471 if (rc < 0)
2472 goto err_out_client;
Alex Elder9bb2f332012-07-12 10:46:35 -05002473 rbd_dev->pool_id = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002474
2475 /* register our block device */
Alex Elder27cc2592012-02-02 08:13:30 -06002476 rc = register_blkdev(0, rbd_dev->name);
2477 if (rc < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002478 goto err_out_client;
Alex Elder27cc2592012-02-02 08:13:30 -06002479 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002480
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002481 rc = rbd_bus_add_dev(rbd_dev);
2482 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002483 goto err_out_blkdev;
2484
Alex Elder32eec682012-02-08 16:11:14 -06002485 /*
2486 * At this point cleanup in the event of an error is the job
2487 * of the sysfs code (initiated by rbd_bus_del_dev()).
2488 *
2489 * Set up and announce blkdev mapping.
2490 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002491 rc = rbd_init_disk(rbd_dev);
2492 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002493 goto err_out_bus;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002494
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002495 rc = rbd_init_watch_dev(rbd_dev);
2496 if (rc)
2497 goto err_out_bus;
2498
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002499 return count;
2500
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002501err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002502 /* this will also clean up rest of rbd_dev stuff */
2503
2504 rbd_bus_del_dev(rbd_dev);
2505 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002506 return rc;
2507
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002508err_out_blkdev:
2509 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2510err_out_client:
2511 rbd_put_client(rbd_dev);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002512err_put_id:
Alex Eldercb8627c2012-07-09 21:04:23 -05002513 if (rbd_dev->pool_name) {
Alex Elder820a5f32012-07-09 21:04:24 -05002514 kfree(rbd_dev->snap_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002515 kfree(rbd_dev->header_name);
2516 kfree(rbd_dev->image_name);
Alex Eldercb8627c2012-07-09 21:04:23 -05002517 kfree(rbd_dev->pool_name);
2518 }
Alex Elder499afd52012-02-02 08:13:29 -06002519 rbd_id_put(rbd_dev);
Alex Elder27cc2592012-02-02 08:13:30 -06002520err_nomem:
Alex Elder27cc2592012-02-02 08:13:30 -06002521 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05002522 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06002523
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002524 dout("Error adding device %s\n", buf);
2525 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06002526
2527 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002528}
2529
2530static struct rbd_device *__rbd_get_dev(unsigned long id)
2531{
2532 struct list_head *tmp;
2533 struct rbd_device *rbd_dev;
2534
Alex Eldere124a822012-01-29 13:57:44 -06002535 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002536 list_for_each(tmp, &rbd_dev_list) {
2537 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Eldere124a822012-01-29 13:57:44 -06002538 if (rbd_dev->id == id) {
2539 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002540 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06002541 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002542 }
Alex Eldere124a822012-01-29 13:57:44 -06002543 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002544 return NULL;
2545}
2546
/*
 * Final teardown of an rbd device, invoked by the driver core when the
 * device's last reference is dropped (after rbd_bus_del_dev()).
 *
 * The cleanup steps below are order-sensitive: the lingering watch
 * request must be unregistered with the OSD client before the watch
 * itself is torn down and before the ceph client reference is dropped,
 * and all per-device state must be released before the struct itself
 * is freed.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* stop the lingering watch request before dropping the client */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev, rbd_dev->header_name);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->snap_name);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
2577
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002578static ssize_t rbd_remove(struct bus_type *bus,
2579 const char *buf,
2580 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002581{
2582 struct rbd_device *rbd_dev = NULL;
2583 int target_id, rc;
2584 unsigned long ul;
2585 int ret = count;
2586
2587 rc = strict_strtoul(buf, 10, &ul);
2588 if (rc)
2589 return rc;
2590
2591 /* convert to int; abort if we lost anything in the conversion */
2592 target_id = (int) ul;
2593 if (target_id != ul)
2594 return -EINVAL;
2595
2596 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2597
2598 rbd_dev = __rbd_get_dev(target_id);
2599 if (!rbd_dev) {
2600 ret = -ENOENT;
2601 goto done;
2602 }
2603
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002604 __rbd_remove_all_snaps(rbd_dev);
2605 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002606
2607done:
2608 mutex_unlock(&ctl_mutex);
2609 return ret;
2610}
2611
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002612static ssize_t rbd_snap_add(struct device *dev,
2613 struct device_attribute *attr,
2614 const char *buf,
2615 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002616{
Alex Elder593a9e72012-02-07 12:03:37 -06002617 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002618 int ret;
2619 char *name = kmalloc(count + 1, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002620 if (!name)
2621 return -ENOMEM;
2622
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002623 snprintf(name, count, "%s", buf);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002624
2625 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2626
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002627 ret = rbd_header_add_snap(rbd_dev,
2628 name, GFP_KERNEL);
2629 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002630 goto err_unlock;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002631
Josh Durgin263c6ca2011-12-05 10:43:42 -08002632 ret = __rbd_refresh_header(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002633 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002634 goto err_unlock;
2635
2636 /* shouldn't hold ctl_mutex when notifying.. notify might
2637 trigger a watch callback that would need to get that mutex */
2638 mutex_unlock(&ctl_mutex);
2639
2640 /* make a best effort, don't error if failed */
Alex Elder0bed54d2012-07-03 16:01:18 -05002641 rbd_req_sync_notify(rbd_dev, rbd_dev->header_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002642
2643 ret = count;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002644 kfree(name);
2645 return ret;
2646
2647err_unlock:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002648 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002649 kfree(name);
2650 return ret;
2651}
2652
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002653/*
2654 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002655 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002656 */
2657static int rbd_sysfs_init(void)
2658{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002659 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002660
Alex Elderfed4c142012-02-07 12:03:36 -06002661 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06002662 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002663 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002664
Alex Elderfed4c142012-02-07 12:03:36 -06002665 ret = bus_register(&rbd_bus_type);
2666 if (ret < 0)
2667 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002668
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002669 return ret;
2670}
2671
/* Tear down the sysfs entries in the reverse order of rbd_sysfs_init(). */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
2677
2678int __init rbd_init(void)
2679{
2680 int rc;
2681
2682 rc = rbd_sysfs_init();
2683 if (rc)
2684 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002685 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002686 return 0;
2687}
2688
/* Module exit point: remove the rbd bus and root device from sysfs. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2693
/* Module registration and metadata. */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");