blob: 74e6a33297064ba50f89b6f5acc2d963a3e22395 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elder593a9e72012-02-07 12:03:37 -060044/*
45 * The basic unit of block I/O is a sector. It is interpreted in a
46 * number of contexts in Linux (blk, bio, genhd), but the default is
47 * universally 512 bytes. These symbols are just slightly more
48 * meaningful than the bare numbers they represent.
49 */
#define SECTOR_SHIFT	9
#define SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

#define RBD_MAX_SNAP_NAME_LEN	32
#define RBD_MAX_OPT_LEN		1024

/* "snapshot" name used when mapping the live (writable) image head */
#define RBD_SNAP_HEAD_NAME	"-"
62
Alex Elder81a89792012-02-02 08:13:30 -060063/*
64 * An RBD device name will be "rbd#", where the "rbd" comes from
65 * RBD_DRV_NAME above, and # is a unique integer identifier.
66 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
67 * enough to hold all possible device names.
68 */
#define DEV_NAME_LEN		32
/* enough chars for the decimal digits of an int, plus terminator */
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

/* default for the "notify_timeout=%d" option; units per option parser */
#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
73
Yehuda Sadeh602adf42010-08-12 16:11:25 -070074/*
75 * block device image metadata (in-memory version)
76 */
77struct rbd_image_header {
78 u64 image_size;
Alex Elder849b4262012-07-09 21:04:24 -050079 char *object_prefix;
Yehuda Sadeh602adf42010-08-12 16:11:25 -070080 __u8 obj_order;
81 __u8 crypt_type;
82 __u8 comp_type;
Yehuda Sadeh602adf42010-08-12 16:11:25 -070083 struct ceph_snap_context *snapc;
84 size_t snap_names_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -070085 u32 total_snaps;
86
87 char *snap_names;
88 u64 *snap_sizes;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070089
90 u64 obj_version;
91};
92
struct rbd_options {
	int notify_timeout;	/* from the "notify_timeout=%d" option */
};
96
97/*
Alex Elderf0f8cef2012-01-29 13:57:44 -060098 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -070099 */
struct rbd_client {
	struct ceph_client *client;
	struct rbd_options *rbd_opts;	/* owned; freed in rbd_client_release() */
	struct kref kref;		/* released via rbd_client_release() */
	struct list_head node;		/* entry on rbd_client_list */
};
106
107/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600108 * a request completion status
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700109 */
struct rbd_req_status {
	int done;	/* nonzero once this sub-request has completed */
	int rc;		/* completion status reported to the blk layer */
	u64 bytes;	/* number of bytes completed */
};
115
116/*
117 * a collection of requests
118 */
struct rbd_req_coll {
	int total;		/* number of status slots */
	int num_done;		/* slots already reported to the blk layer */
	struct kref kref;	/* released via rbd_coll_release() */
	struct rbd_req_status status[0];	/* trailing array (old [0]
						 * flexible-array idiom) */
};
125
Alex Elderf0f8cef2012-01-29 13:57:44 -0600126/*
127 * a single io request
128 */
129struct rbd_request {
130 struct request *rq; /* blk layer request */
131 struct bio *bio; /* cloned bio */
132 struct page **pages; /* list of used pages */
133 u64 len;
134 int coll_index;
135 struct rbd_req_coll *coll;
136};
137
struct rbd_snap {
	struct device dev;	/* sysfs representation of this snapshot */
	const char *name;
	u64 size;		/* image size at snapshot time */
	struct list_head node;	/* entry on rbd_device->snaps */
	u64 id;			/* snapshot id */
};
145
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700146/*
147 * a single device
148 */
struct rbd_device {
	int id;			/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;		/* blkdev's gendisk and rq */
	struct request_queue *q;

	struct rbd_client *rbd_client;	/* possibly shared client; ref held */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;		/* queue lock */

	struct rbd_image_header header;
	char *image_name;
	size_t image_name_len;
	char *header_name;
	char *pool_name;
	int pool_id;

	struct ceph_osd_event *watch_event;
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore header_rwsem;
	/* name of the snapshot this device reads from */
	char *snap_name;
	/* id of the snapshot this device reads from */
	u64 snap_id;		/* current snapshot id */
	/* whether the snap_id this device reads from still exists */
	bool snap_exists;
	int read_only;

	struct list_head node;		/* entry on rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
};
190
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);	/* protects rbd_dev_list */

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);	/* protects rbd_client_list */

/* forward declarations; the implementations appear later in the file */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
static void rbd_dev_release(struct device *dev);
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count);
static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
				  struct rbd_snap *snap);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
212
/* sysfs bus attributes: /sys/bus/rbd/add and /sys/bus/rbd/remove */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/* empty release: rbd_root_dev is static, there is nothing to free */
static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};
232
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800233
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800234static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
235{
236 return get_device(&rbd_dev->dev);
237}
238
239static void rbd_put_dev(struct rbd_device *rbd_dev)
240{
241 put_device(&rbd_dev->dev);
242}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700243
/* forward declaration; defined later in the file */
static int __rbd_refresh_header(struct rbd_device *rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700245
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700246static int rbd_open(struct block_device *bdev, fmode_t mode)
247{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600248 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700249
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800250 rbd_get_dev(rbd_dev);
251
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700252 set_device_ro(bdev, rbd_dev->read_only);
253
254 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
255 return -EROFS;
256
257 return 0;
258}
259
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800260static int rbd_release(struct gendisk *disk, fmode_t mode)
261{
262 struct rbd_device *rbd_dev = disk->private_data;
263
264 rbd_put_dev(rbd_dev);
265
266 return 0;
267}
268
/* block device entry points; rbd supports only open and release */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
274
275/*
276 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500277 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700278 */
Alex Elder43ae4702012-07-03 16:01:18 -0500279static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700280 struct rbd_options *rbd_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700281{
282 struct rbd_client *rbdc;
283 int ret = -ENOMEM;
284
285 dout("rbd_client_create\n");
286 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
287 if (!rbdc)
288 goto out_opt;
289
290 kref_init(&rbdc->kref);
291 INIT_LIST_HEAD(&rbdc->node);
292
Alex Elderbc534d82012-01-29 13:57:44 -0600293 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
294
Alex Elder43ae4702012-07-03 16:01:18 -0500295 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700296 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600297 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500298 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700299
300 ret = ceph_open_session(rbdc->client);
301 if (ret < 0)
302 goto out_err;
303
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700304 rbdc->rbd_opts = rbd_opts;
305
Alex Elder432b8582012-01-29 13:57:44 -0600306 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700307 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600308 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700309
Alex Elderbc534d82012-01-29 13:57:44 -0600310 mutex_unlock(&ctl_mutex);
311
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700312 dout("rbd_client_create created %p\n", rbdc);
313 return rbdc;
314
315out_err:
316 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600317out_mutex:
318 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700319 kfree(rbdc);
320out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500321 if (ceph_opts)
322 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400323 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700324}
325
326/*
327 * Find a ceph client with specific addr and configuration.
328 */
Alex Elder43ae4702012-07-03 16:01:18 -0500329static struct rbd_client *__rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700330{
331 struct rbd_client *client_node;
332
Alex Elder43ae4702012-07-03 16:01:18 -0500333 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700334 return NULL;
335
336 list_for_each_entry(client_node, &rbd_client_list, node)
Alex Elder43ae4702012-07-03 16:01:18 -0500337 if (!ceph_compare_options(ceph_opts, client_node->client))
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700338 return client_node;
339 return NULL;
340}
341
342/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700343 * mount options
344 */
/* token ids; the Opt_last_* entries are group markers, not real options */
enum {
	Opt_notify_timeout,
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
};
352
/* match_token() table mapping option strings to the token ids above */
static match_table_t rbd_opts_tokens = {
	{Opt_notify_timeout, "notify_timeout=%d"},
	/* int args above */
	/* string args above */
	{-1, NULL}
};
359
/*
 * Parse one rbd-specific mount-option token.  Passed as a callback to
 * ceph_parse_options(); @private is the struct rbd_options to fill in.
 * Returns 0 on success, -EINVAL for an unknown token, or the match_int()
 * error for a malformed integer argument.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* tokens are grouped by argument type; the Opt_last_* markers in
	 * the enum delimit the groups */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_notify_timeout:
		rbd_opts->notify_timeout = intval;
		break;
	default:
		/* unreachable if the token table and enum stay in sync */
		BUG_ON(token);
	}
	return 0;
}
394
395/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700396 * Get a ceph client with specific addr and configuration, if one does
397 * not exist create it.
398 */
static struct rbd_client *rbd_get_client(const char *mon_addr,
					 size_t mon_addr_len,
					 char *options)
{
	struct rbd_client *rbdc;
	struct ceph_options *ceph_opts;
	struct rbd_options *rbd_opts;

	rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		return ERR_PTR(-ENOMEM);

	rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;

	/* rbd-specific options go to rbd_opts via the callback;
	 * everything else is parsed into ceph_opts */
	ceph_opts = ceph_parse_options(options, mon_addr,
				       mon_addr + mon_addr_len,
				       parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(ceph_opts)) {
		kfree(rbd_opts);
		return ERR_CAST(ceph_opts);
	}

	spin_lock(&rbd_client_list_lock);
	rbdc = __rbd_client_find(ceph_opts);
	if (rbdc) {
		/* using an existing client */
		kref_get(&rbdc->kref);
		spin_unlock(&rbd_client_list_lock);

		/* the shared client keeps its own options; ours are unused */
		ceph_destroy_options(ceph_opts);
		kfree(rbd_opts);

		return rbdc;
	}
	spin_unlock(&rbd_client_list_lock);

	/* rbd_client_create() consumes ceph_opts even on failure ... */
	rbdc = rbd_client_create(ceph_opts, rbd_opts);

	/* ... but not rbd_opts, which we free here if creation failed */
	if (IS_ERR(rbdc))
		kfree(rbd_opts);

	return rbdc;
}
442
/*
 * Destroy ceph client
 *
 * Takes rbd_client_list_lock itself to unlink the client, so the
 * caller must NOT hold rbd_client_list_lock when dropping the final
 * reference.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	/* unlink from the shared client list under its lock */
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc->rbd_opts);
	kfree(rbdc);
}
461
462/*
463 * Drop reference to ceph client node. If it's not referenced anymore, release
464 * it.
465 */
static void rbd_put_client(struct rbd_device *rbd_dev)
{
	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
	/* clear the stale pointer so a later use is an obvious NULL deref */
	rbd_dev->rbd_client = NULL;
}
471
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700472/*
473 * Destroy requests collection
474 */
475static void rbd_coll_release(struct kref *kref)
476{
477 struct rbd_req_coll *coll =
478 container_of(kref, struct rbd_req_coll, kref);
479
480 dout("rbd_coll_release %p\n", coll);
481 kfree(coll);
482}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700483
484/*
485 * Create a new header structure, translate header format from the on-disk
486 * header.
487 */
static int rbd_header_from_disk(struct rbd_image_header *header,
				 struct rbd_image_header_ondisk *ondisk,
				 u32 allocated_snaps,
				 gfp_t gfp_flags)
{
	u32 i, snap_count;

	/* refuse anything that does not carry the rbd header magic */
	if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT)))
		return -ENXIO;

	snap_count = le32_to_cpu(ondisk->snap_count);
	/* bound snap_count so the snapc size computation cannot overflow */
	if (snap_count > (UINT_MAX - sizeof(struct ceph_snap_context))
			 / sizeof (*ondisk))
		return -EINVAL;
	header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
				snap_count * sizeof(u64),
				gfp_flags);
	if (!header->snapc)
		return -ENOMEM;

	/* NOTE(review): snap_names_len comes straight off the wire and is
	 * used below as an allocation and copy size without validation —
	 * verify the callers bound it against the object size. */
	header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
	if (snap_count) {
		header->snap_names = kmalloc(header->snap_names_len,
					     gfp_flags);
		if (!header->snap_names)
			goto err_snapc;
		header->snap_sizes = kmalloc(snap_count * sizeof(u64),
					     gfp_flags);
		if (!header->snap_sizes)
			goto err_names;
	} else {
		header->snap_names = NULL;
		header->snap_sizes = NULL;
	}

	header->object_prefix = kmalloc(sizeof (ondisk->block_name) + 1,
					gfp_flags);
	if (!header->object_prefix)
		goto err_sizes;

	/* the on-disk block_name need not be NUL-terminated; our copy is */
	memcpy(header->object_prefix, ondisk->block_name,
	       sizeof(ondisk->block_name));
	header->object_prefix[sizeof (ondisk->block_name)] = '\0';

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->obj_order = ondisk->options.order;
	header->crypt_type = ondisk->options.crypt_type;
	header->comp_type = ondisk->options.comp_type;

	atomic_set(&header->snapc->nref, 1);
	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
	header->snapc->num_snaps = snap_count;
	header->total_snaps = snap_count;

	/* only fill in snapshot data when the caller read all of it; a
	 * probing pass may have allocated room for fewer snaps */
	if (snap_count && allocated_snaps == snap_count) {
		for (i = 0; i < snap_count; i++) {
			header->snapc->snaps[i] =
				le64_to_cpu(ondisk->snaps[i].id);
			header->snap_sizes[i] =
				le64_to_cpu(ondisk->snaps[i].image_size);
		}

		/* copy snapshot names (packed right after the snap array) */
		memcpy(header->snap_names, &ondisk->snaps[i],
			header->snap_names_len);
	}

	return 0;

err_sizes:
	kfree(header->snap_sizes);
err_names:
	kfree(header->snap_names);
err_snapc:
	kfree(header->snapc);
	return -ENOMEM;
}
565
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700566static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
567 u64 *seq, u64 *size)
568{
569 int i;
570 char *p = header->snap_names;
571
Alex Elder00f1f362012-02-07 12:03:36 -0600572 for (i = 0; i < header->total_snaps; i++) {
573 if (!strcmp(snap_name, p)) {
574
575 /* Found it. Pass back its id and/or size */
576
577 if (seq)
578 *seq = header->snapc->snaps[i];
579 if (size)
580 *size = header->snap_sizes[i];
581 return i;
582 }
583 p += strlen(p) + 1; /* Skip ahead to the next name */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700584 }
Alex Elder00f1f362012-02-07 12:03:36 -0600585 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700586}
587
static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
{
	int ret;

	down_write(&rbd_dev->header_rwsem);

	if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		/* mapping the live image head: writable, no snapshot id */
		rbd_dev->snap_id = CEPH_NOSNAP;
		rbd_dev->snap_exists = false;
		rbd_dev->read_only = 0;
		if (size)
			*size = rbd_dev->header.image_size;
	} else {
		u64 snap_id = 0;

		ret = snap_by_name(&rbd_dev->header, rbd_dev->snap_name,
				   &snap_id, size);
		if (ret < 0)
			goto done;
		/* snapshots are immutable, so the mapping is read-only */
		rbd_dev->snap_id = snap_id;
		rbd_dev->snap_exists = true;
		rbd_dev->read_only = 1;
	}

	ret = 0;
done:
	up_write(&rbd_dev->header_rwsem);
	return ret;
}
618
619static void rbd_header_free(struct rbd_image_header *header)
620{
Alex Elder849b4262012-07-09 21:04:24 -0500621 kfree(header->object_prefix);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700622 kfree(header->snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -0500623 kfree(header->snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -0800624 ceph_put_snap_context(header->snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700625}
626
627/*
628 * get the actual striped segment name, offset and length
629 */
630static u64 rbd_get_segment(struct rbd_image_header *header,
Alex Elderca1e49a2012-07-10 20:30:09 -0500631 const char *object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700632 u64 ofs, u64 len,
633 char *seg_name, u64 *segofs)
634{
635 u64 seg = ofs >> header->obj_order;
636
637 if (seg_name)
638 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
Alex Elderca1e49a2012-07-10 20:30:09 -0500639 "%s.%012llx", object_prefix, seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700640
641 ofs = ofs & ((1 << header->obj_order) - 1);
642 len = min_t(u64, len, (1 << header->obj_order) - ofs);
643
644 if (segofs)
645 *segofs = ofs;
646
647 return len;
648}
649
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700650static int rbd_get_num_segments(struct rbd_image_header *header,
651 u64 ofs, u64 len)
652{
653 u64 start_seg = ofs >> header->obj_order;
654 u64 end_seg = (ofs + len - 1) >> header->obj_order;
655 return end_seg - start_seg + 1;
656}
657
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700658/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700659 * returns the size of an object in the image
660 */
661static u64 rbd_obj_bytes(struct rbd_image_header *header)
662{
663 return 1 << header->obj_order;
664}
665
666/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700667 * bio helpers
668 */
669
670static void bio_chain_put(struct bio *chain)
671{
672 struct bio *tmp;
673
674 while (chain) {
675 tmp = chain;
676 chain = chain->bi_next;
677 bio_put(tmp);
678 }
679}
680
681/*
682 * zeros a bio chain, starting at specific offset
683 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* running byte offset within the whole chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* zero the part of this segment that lies
				 * at or beyond start_ofs */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
707
708/*
709 * bio_chain_clone - clone a chain of bios up to a certain length.
710 * might return a bio_pair that will need to be released.
711 */
712static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
713 struct bio_pair **bp,
714 int len, gfp_t gfpmask)
715{
716 struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
717 int total = 0;
718
719 if (*bp) {
720 bio_pair_release(*bp);
721 *bp = NULL;
722 }
723
724 while (old_chain && (total < len)) {
725 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
726 if (!tmp)
727 goto err_out;
728
729 if (total + old_chain->bi_size > len) {
730 struct bio_pair *bp;
731
732 /*
733 * this split can only happen with a single paged bio,
734 * split_bio will BUG_ON if this is not the case
735 */
736 dout("bio_chain_clone split! total=%d remaining=%d"
737 "bi_size=%d\n",
738 (int)total, (int)len-total,
739 (int)old_chain->bi_size);
740
741 /* split the bio. We'll release it either in the next
742 call, or it will have to be released outside */
Alex Elder593a9e72012-02-07 12:03:37 -0600743 bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700744 if (!bp)
745 goto err_out;
746
747 __bio_clone(tmp, &bp->bio1);
748
749 *next = &bp->bio2;
750 } else {
751 __bio_clone(tmp, old_chain);
752 *next = old_chain->bi_next;
753 }
754
755 tmp->bi_bdev = NULL;
756 gfpmask &= ~__GFP_WAIT;
757 tmp->bi_next = NULL;
758
759 if (!new_chain) {
760 new_chain = tail = tmp;
761 } else {
762 tail->bi_next = tmp;
763 tail = tmp;
764 }
765 old_chain = old_chain->bi_next;
766
767 total += tmp->bi_size;
768 }
769
770 BUG_ON(total < len);
771
772 if (tail)
773 tail->bi_next = NULL;
774
775 *old = old_chain;
776
777 return new_chain;
778
779err_out:
780 dout("bio_chain_clone with err\n");
781 bio_chain_put(new_chain);
782 return NULL;
783}
784
785/*
786 * helpers for osd request op vectors.
787 */
788static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
789 int num_ops,
790 int opcode,
791 u32 payload_len)
792{
793 *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
794 GFP_NOIO);
795 if (!*ops)
796 return -ENOMEM;
797 (*ops)[0].op = opcode;
798 /*
799 * op extent offset and length will be set later on
800 * in calc_raw_layout()
801 */
802 (*ops)[0].payload_len = payload_len;
803 return 0;
804}
805
/* Free an op vector from rbd_create_rw_ops(); NULL is a safe no-op. */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
810
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n",
	     coll, index, ret, len);

	if (!rq)
		return;

	if (!coll) {
		/* not part of a collection: complete the request directly */
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	/* queue_lock serializes updates to the collection state */
	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	max = min = coll->num_done;
	/* sub-requests must be reported to the blk layer in order, so
	 * only advance over a contiguous run of completed slots */
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i<max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		/* each completed slot drops one collection reference */
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
848
/* Complete this request's slot in its collection (if any). */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
854
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700855/*
856 * Send ceph osd request
857 */
858static int rbd_do_request(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -0500859 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700860 struct ceph_snap_context *snapc,
861 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -0500862 const char *object_name, u64 ofs, u64 len,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700863 struct bio *bio,
864 struct page **pages,
865 int num_pages,
866 int flags,
867 struct ceph_osd_req_op *ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700868 struct rbd_req_coll *coll,
869 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700870 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700871 struct ceph_msg *msg),
872 struct ceph_osd_request **linger_req,
873 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700874{
875 struct ceph_osd_request *req;
876 struct ceph_file_layout *layout;
877 int ret;
878 u64 bno;
879 struct timespec mtime = CURRENT_TIME;
880 struct rbd_request *req_data;
881 struct ceph_osd_request_head *reqhead;
Alex Elder1dbb4392012-01-24 10:08:37 -0600882 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700883
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700884 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700885 if (!req_data) {
886 if (coll)
887 rbd_coll_end_req_index(rq, coll, coll_index,
888 -ENOMEM, len);
889 return -ENOMEM;
890 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700891
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700892 if (coll) {
893 req_data->coll = coll;
894 req_data->coll_index = coll_index;
895 }
896
Alex Elderaded07e2012-07-03 16:01:18 -0500897 dout("rbd_do_request object_name=%s ofs=%lld len=%lld\n",
898 object_name, len, ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700899
Alex Elder0ce1a792012-07-03 16:01:18 -0500900 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder1dbb4392012-01-24 10:08:37 -0600901 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
902 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -0700903 if (!req) {
Sage Weil4ad12622011-05-03 09:23:36 -0700904 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700905 goto done_pages;
906 }
907
908 req->r_callback = rbd_cb;
909
910 req_data->rq = rq;
911 req_data->bio = bio;
912 req_data->pages = pages;
913 req_data->len = len;
914
915 req->r_priv = req_data;
916
917 reqhead = req->r_request->front.iov_base;
918 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
919
Alex Elderaded07e2012-07-03 16:01:18 -0500920 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700921 req->r_oid_len = strlen(req->r_oid);
922
923 layout = &req->r_file_layout;
924 memset(layout, 0, sizeof(*layout));
925 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
926 layout->fl_stripe_count = cpu_to_le32(1);
927 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
Alex Elder0ce1a792012-07-03 16:01:18 -0500928 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
Alex Elder1dbb4392012-01-24 10:08:37 -0600929 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
930 req, ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700931
932 ceph_osdc_build_request(req, ofs, &len,
933 ops,
934 snapc,
935 &mtime,
936 req->r_oid, req->r_oid_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700937
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700938 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600939 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700940 *linger_req = req;
941 }
942
Alex Elder1dbb4392012-01-24 10:08:37 -0600943 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700944 if (ret < 0)
945 goto done_err;
946
947 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600948 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700949 if (ver)
950 *ver = le64_to_cpu(req->r_reassert_version.version);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700951 dout("reassert_ver=%lld\n",
952 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700953 ceph_osdc_put_request(req);
954 }
955 return ret;
956
957done_err:
958 bio_chain_put(req_data->bio);
959 ceph_osdc_put_request(req);
960done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700961 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700962 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700963 return ret;
964}
965
/*
 * Ceph osd op callback
 *
 * Completion handler for async reads/writes issued by rbd_do_op().
 * Parses the osd reply, patches up read results (holes and short
 * reads become zero-filled data), completes the collection slot, and
 * releases the per-request state.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);

	if (rc == -ENOENT && read_op) {
		/* reading an object that doesn't exist yet (a hole):
		 * the data is all zeros, not an error */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero-fill the tail of the bio chain */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1004
/*
 * Completion callback for requests nobody waits on (e.g. notify
 * acks): simply drop the osd request reference taken at submission.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1009
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a page vector covering [ofs, ofs+len), copies @buf into it
 * for writes, and issues the request through rbd_do_request() with no
 * callback so the call blocks until the osd replies.  For reads the
 * returned data is copied back out into @buf.
 *
 * If @orig_ops is NULL a single op of @opcode is built here (and torn
 * down again before returning); otherwise the caller's op vector is
 * used as-is and the caller keeps ownership of it.  Returns the osd
 * result (bytes transferred for reads) or a negative errno.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int opcode,
			   int flags,
			   struct ceph_osd_req_op *orig_ops,
			   const char *object_name,
			   u64 ofs, u64 len,
			   char *buf,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;
	struct ceph_osd_req_op *ops = orig_ops;
	u32 payload_len;

	num_pages = calc_pages_for(ofs , len);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	if (!orig_ops) {
		/* writes carry the data as payload; reads carry none */
		payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
		ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
		if (ret < 0)
			goto done;

		if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
			ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
			if (ret < 0)
				goto done_ops;
		}
	}

	/* no callback: rbd_do_request() waits for completion itself */
	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			  object_name, ofs, len, NULL,
			  pages, num_pages,
			  flags,
			  ops,
			  NULL, 0,
			  NULL,
			  linger_req, ver);
	if (ret < 0)
		goto done_ops;

	/* for reads, ret is the byte count the osd returned */
	if ((flags & CEPH_OSD_FLAG_READ) && buf)
		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);

done_ops:
	if (!orig_ops)
		rbd_destroy_ops(ops);
done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1070
/*
 * Do an asynchronous ceph osd operation
 *
 * Maps the image-relative range [ofs, ofs+len) onto its containing
 * rados object segment and fires off a non-blocking read or write for
 * it; rbd_req_cb() completes slot @coll_index of @coll when the osd
 * answers.  The caller has already split bios on segment boundaries,
 * so the range is guaranteed to fit in one segment (BUG_ON below).
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 snapid,
		     int opcode, int flags,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;

	seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
	if (!seg_name)
		return -ENOMEM;

	/* translate image offset into (object name, offset, length) */
	seg_len = rbd_get_segment(&rbd_dev->header,
				  rbd_dev->header.object_prefix,
				  ofs, len,
				  seg_name, &seg_ofs);

	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);

	ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
	if (ret < 0)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	BUG_ON(seg_len < len);

	/* async: rbd_req_cb owns completion and cleanup */
	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1125
1126/*
1127 * Request async osd write
1128 */
1129static int rbd_req_write(struct request *rq,
1130 struct rbd_device *rbd_dev,
1131 struct ceph_snap_context *snapc,
1132 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001133 struct bio *bio,
1134 struct rbd_req_coll *coll,
1135 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001136{
1137 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1138 CEPH_OSD_OP_WRITE,
1139 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001140 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001141}
1142
1143/*
1144 * Request async osd read
1145 */
1146static int rbd_req_read(struct request *rq,
1147 struct rbd_device *rbd_dev,
1148 u64 snapid,
1149 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001150 struct bio *bio,
1151 struct rbd_req_coll *coll,
1152 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001153{
1154 return rbd_do_op(rq, rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001155 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001156 CEPH_OSD_OP_READ,
1157 CEPH_OSD_FLAG_READ,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001158 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001159}
1160
1161/*
1162 * Request sync osd read
1163 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001164static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001165 struct ceph_snap_context *snapc,
1166 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001167 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001168 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001169 char *buf,
1170 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001171{
Alex Elder0ce1a792012-07-03 16:01:18 -05001172 return rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001173 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001174 CEPH_OSD_OP_READ,
1175 CEPH_OSD_FLAG_READ,
1176 NULL,
Alex Elderd1f57ea2012-06-26 12:57:03 -07001177 object_name, ofs, len, buf, NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001178}
1179
/*
 * Acknowledge a notification received on a watched header object.
 * Fire-and-forget: the completion callback (rbd_simple_req_cb) just
 * drops the request reference, so despite the _sync_ name this does
 * not wait for the ack to reach the osd.
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id,
				   const char *object_name)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (ret < 0)
		return ret;

	ops[0].watch.ver = cpu_to_le64(ver);
	/* NOTE(review): cookie is not byte-swapped here, unlike in
	 * rbd_req_sync_watch() — presumably notify_id arrives in wire
	 * order from the watch callback; verify against osd_client. */
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  object_name, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  ops,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1210
/*
 * Watch callback for the header object: something changed the header
 * (e.g. a snapshot was created), so re-read it under ctl_mutex and
 * acknowledge the notification.
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	u64 hver;
	int rc;

	if (!rbd_dev)
		return;

	dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n",
		rbd_dev->header_name, notify_id, (int) opcode);
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	rc = __rbd_refresh_header(rbd_dev);
	/* snapshot the version while still under the mutex */
	hver = rbd_dev->header.obj_version;
	mutex_unlock(&ctl_mutex);
	if (rc)
		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
			   " update snaps: %d\n", rbd_dev->major, rc);

	/* ack even if the refresh failed, so the osd stops resending */
	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id, rbd_dev->header_name);
}
1232
/*
 * Request sync osd watch
 *
 * Registers a watch on @object_name so the osd notifies us (via
 * rbd_watch_cb) when the header changes.  Creates the osd event,
 * issues a lingering WATCH op, and stashes both in rbd_dev
 * (watch_event / watch_request) for later teardown by
 * rbd_req_sync_unwatch().
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev,
			      const char *object_name,
			      u64 ver)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;

	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
	if (ret < 0)
		return ret;

	/* the event delivers notifications to rbd_watch_cb(rbd_dev) */
	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(ver);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 = register the watch */

	/* lingering request: the osd client re-sends it after reconnects */
	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      object_name, 0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1277
/*
 * Request sync osd unwatch
 *
 * Tears down the watch set up by rbd_req_sync_watch(): sends a WATCH
 * op with flag = 0 to unregister on the osd, then cancels and clears
 * the local event.  Assumes watch_event is non-NULL (a watch was
 * previously registered).
 */
static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev,
				const char *object_name)
{
	struct ceph_osd_req_op *ops;

	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
	if (ret < 0)
		return ret;

	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 0;	/* 0 = unregister the watch */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      object_name, 0, 0, NULL, NULL, NULL);

	/* the local event is released even if the osd op failed */
	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	return ret;
}
1306
/* context handed to rbd_notify_cb() via ceph_osdc_create_event() */
struct rbd_notify_info {
	struct rbd_device *rbd_dev;
};
1310
1311static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1312{
Alex Elder0ce1a792012-07-03 16:01:18 -05001313 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1314 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001315 return;
1316
Alex Elder0ce1a792012-07-03 16:01:18 -05001317 dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n",
Alex Elder0bed54d2012-07-03 16:01:18 -05001318 rbd_dev->header_name,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001319 notify_id, (int)opcode);
1320}
1321
/*
 * Request sync osd notify
 *
 * Sends a NOTIFY on @object_name (payload: two u32s, version and
 * timeout) and waits until all watchers have acknowledged, using a
 * local event delivered to rbd_notify_cb().
 *
 * NOTE(review): on the success path the event is not canceled after
 * ceph_osdc_wait_event(), and the wait result is discarded — verify
 * ceph_osdc_wait_event() semantics (possible event leak / swallowed
 * timeout).
 */
static int rbd_req_sync_notify(struct rbd_device *rbd_dev,
			       const char *object_name)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_event *event;
	struct rbd_notify_info info;
	int payload_len = sizeof(u32) + sizeof(u32);
	int ret;

	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
	if (ret < 0)
		return ret;

	info.rbd_dev = rbd_dev;

	/* one_shot event (third argument 1): fires at most once */
	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
				     (void *)&info, &event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = 1;
	ops[0].watch.flag = 1;
	ops[0].watch.cookie = event->cookie;
	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
	ops[0].watch.timeout = 12;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      object_name, 0, 0, NULL, NULL, NULL);
	if (ret < 0)
		goto fail_event;

	ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
	dout("ceph_osdc_wait_event returned %d\n", ret);
	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(event);
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1372
/*
 * Request sync osd class-method call (exec)
 *
 * (The old header comment said "read"; this actually invokes a rados
 * object class method, e.g. rbd.snap_add.)  Sends @data (@len bytes)
 * as input to @class_name.@method_name on @object_name and blocks
 * until the osd replies; @ver, if non-NULL, receives the resulting
 * object version.
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *data,
			     int len,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int class_name_len = strlen(class_name);
	int method_name_len = strlen(method_name);
	/* payload carries class name + method name + input data */
	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
				    class_name_len + method_name_len + len);
	if (ret < 0)
		return ret;

	ops[0].cls.class_name = class_name;
	ops[0].cls.class_len = (__u8) class_name_len;
	ops[0].cls.method_name = method_name;
	ops[0].cls.method_len = (__u8) method_name_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = data;
	ops[0].cls.indata_len = len;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      object_name, 0, 0, NULL, NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1412
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001413static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1414{
1415 struct rbd_req_coll *coll =
1416 kzalloc(sizeof(struct rbd_req_coll) +
1417 sizeof(struct rbd_req_status) * num_reqs,
1418 GFP_ATOMIC);
1419
1420 if (!coll)
1421 return NULL;
1422 coll->total = num_reqs;
1423 kref_init(&coll->kref);
1424 return coll;
1425}
1426
/*
 * block device queue callback
 *
 * Entered by the block layer with q->queue_lock held.  For each
 * fetched request we drop the lock while building and submitting the
 * osd requests (which may sleep/allocate), and re-take it before the
 * next blk_fetch_request().  Each request is split on rados object
 * boundaries; a ref-counted collection tracks the segments so the
 * block request is completed in order as the osd replies arrive.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;
	struct bio_pair *bp = NULL;

	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		struct bio *rq_bio, *next_bio = NULL;
		bool do_write;
		int size, op_size = 0;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;

		/* peek at request from block layer */
		if (!rq)
			break;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		rq_bio = rq->bio;
		if (do_write && rbd_dev->read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		/* submission below may sleep; blk_fetch_request() has
		 * already dequeued rq, so it's safe to drop the lock */
		spin_unlock_irq(q->queue_lock);

		down_read(&rbd_dev->header_rwsem);

		/* mapped snapshot was deleted out from under us */
		if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) {
			up_read(&rbd_dev->header_rwsem);
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		/* pin the snap context for the lifetime of this request */
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		do {
			/* a bio clone to be passed down to OSD req */
			dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
			op_size = rbd_get_segment(&rbd_dev->header,
						  rbd_dev->header.object_prefix,
						  ofs, size,
						  NULL, NULL);
			/* one collection ref per in-flight segment */
			kref_get(&coll->kref);
			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
					      op_size, GFP_ATOMIC);
			if (!bio) {
				/* fail just this segment; keep going so
				 * the collection stays consistent */
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, op_size);
				goto next_seg;
			}


			/* init OSD command: write or read */
			if (do_write)
				rbd_req_write(rq, rbd_dev,
					      snapc,
					      ofs,
					      op_size, bio,
					      coll, cur_seg);
			else
				rbd_req_read(rq, rbd_dev,
					     rbd_dev->snap_id,
					     ofs,
					     op_size, bio,
					     coll, cur_seg);

next_seg:
			size -= op_size;
			ofs += op_size;

			cur_seg++;
			rq_bio = next_bio;
		} while (size > 0);
		/* drop the allocation's own reference */
		kref_put(&coll->kref, rbd_coll_release);

		if (bp)
			bio_pair_release(bp);
		/* re-take the lock for the next blk_fetch_request() */
		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1545
1546/*
1547 * a queue callback. Makes sure that we don't create a bio that spans across
1548 * multiple osd objects. One exception would be with a single page bios,
1549 * which we handle later at bio_chain_clone
1550 */
1551static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1552 struct bio_vec *bvec)
1553{
1554 struct rbd_device *rbd_dev = q->queuedata;
Alex Elder593a9e72012-02-07 12:03:37 -06001555 unsigned int chunk_sectors;
1556 sector_t sector;
1557 unsigned int bio_sectors;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001558 int max;
1559
Alex Elder593a9e72012-02-07 12:03:37 -06001560 chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1561 sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1562 bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1563
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001564 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
Alex Elder593a9e72012-02-07 12:03:37 -06001565 + bio_sectors)) << SECTOR_SHIFT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001566 if (max < 0)
1567 max = 0; /* bio_add cannot handle a negative return */
1568 if (max <= bvec->bv_len && bio_sectors == 0)
1569 return bvec->bv_len;
1570 return max;
1571}
1572
1573static void rbd_free_disk(struct rbd_device *rbd_dev)
1574{
1575 struct gendisk *disk = rbd_dev->disk;
1576
1577 if (!disk)
1578 return;
1579
1580 rbd_header_free(&rbd_dev->header);
1581
1582 if (disk->flags & GENHD_FL_UP)
1583 del_gendisk(disk);
1584 if (disk->queue)
1585 blk_cleanup_queue(disk->queue);
1586 put_disk(disk);
1587}
1588
/*
 * reload the on-disk header
 *
 * The header's size depends on how many snapshots it describes, which
 * we only learn by reading it, so loop: read with the last known
 * snapshot count, and retry with a bigger buffer whenever the count
 * turned out to have changed in the meantime.
 *
 * On success @header is populated (caller frees via rbd_header_free)
 * and rc is >= 0; negative errno otherwise.
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
			   struct rbd_image_header *header)
{
	ssize_t rc;
	struct rbd_image_header_ondisk *dh;
	u32 snap_count = 0;
	u64 ver;
	size_t len;

	/*
	 * First reads the fixed-size header to determine the number
	 * of snapshots, then re-reads it, along with all snapshot
	 * records as well as their stored names.
	 */
	len = sizeof (*dh);
	while (1) {
		dh = kmalloc(len, GFP_KERNEL);
		if (!dh)
			return -ENOMEM;

		rc = rbd_req_sync_read(rbd_dev,
				       NULL, CEPH_NOSNAP,
				       rbd_dev->header_name,
				       0, len,
				       (char *)dh, &ver);
		if (rc < 0)
			goto out_dh;

		rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
		if (rc < 0) {
			if (rc == -ENXIO)
				pr_warning("unrecognized header format"
					   " for image %s\n",
					   rbd_dev->image_name);
			goto out_dh;
		}

		/* count still matches what we sized the buffer for: done */
		if (snap_count == header->total_snaps)
			break;

		/* snapshot count changed; resize and re-read */
		snap_count = header->total_snaps;
		len = sizeof (*dh) +
			snap_count * sizeof(struct rbd_image_snap_ondisk) +
			header->snap_names_len;

		rbd_header_free(header);
		kfree(dh);
	}
	header->obj_version = ver;

out_dh:
	kfree(dh);
	return rc;
}
1646
1647/*
1648 * create a snapshot
1649 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001650static int rbd_header_add_snap(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001651 const char *snap_name,
1652 gfp_t gfp_flags)
1653{
1654 int name_len = strlen(snap_name);
1655 u64 new_snapid;
1656 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001657 void *data, *p, *e;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001658 u64 ver;
Alex Elder1dbb4392012-01-24 10:08:37 -06001659 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001660
1661 /* we should create a snapshot only if we're pointing at the head */
Alex Elder0ce1a792012-07-03 16:01:18 -05001662 if (rbd_dev->snap_id != CEPH_NOSNAP)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001663 return -EINVAL;
1664
Alex Elder0ce1a792012-07-03 16:01:18 -05001665 monc = &rbd_dev->rbd_client->client->monc;
1666 ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001667 dout("created snapid=%lld\n", new_snapid);
1668 if (ret < 0)
1669 return ret;
1670
1671 data = kmalloc(name_len + 16, gfp_flags);
1672 if (!data)
1673 return -ENOMEM;
1674
Sage Weil916d4d62011-05-12 16:10:50 -07001675 p = data;
1676 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001677
Sage Weil916d4d62011-05-12 16:10:50 -07001678 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1679 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001680
Alex Elder0bed54d2012-07-03 16:01:18 -05001681 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
Alex Elder0ce1a792012-07-03 16:01:18 -05001682 "rbd", "snap_add",
Sage Weil916d4d62011-05-12 16:10:50 -07001683 data, p - data, &ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001684
Sage Weil916d4d62011-05-12 16:10:50 -07001685 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001686
Alex Elder505cbb92012-07-19 08:49:18 -05001687 return ret < 0 ? ret : 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001688bad:
1689 return -ERANGE;
1690}
1691
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001692static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1693{
1694 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001695 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001696
Alex Eldera0593292012-07-19 09:09:27 -05001697 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001698 __rbd_remove_snap_dev(rbd_dev, snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001699}
1700
/*
 * only read the first part of the ondisk header, without the snaps info
 *
 * Re-reads the on-disk header and swaps the new contents into
 * rbd_dev->header under the header rwsem, then rebuilds the snapshot
 * device list.  Returns 0 or a negative errno from the read/rebuild.
 */
static int __rbd_refresh_header(struct rbd_device *rbd_dev)
{
	int ret;
	struct rbd_image_header h;

	/* Fetch a fresh header into the local "h" first */
	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	down_write(&rbd_dev->header_rwsem);

	/* resized? */
	if (rbd_dev->snap_id == CEPH_NOSNAP) {
		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;

		dout("setting size to %llu sectors", (unsigned long long) size);
		set_capacity(rbd_dev->disk, size);
	}

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	/* Adopt the freshly read fields; h's buffers now belong to rbd_dev */
	rbd_dev->header.obj_version = h.obj_version;
	rbd_dev->header.image_size = h.image_size;
	rbd_dev->header.total_snaps = h.total_snaps;
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_names_len = h.snap_names_len;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	/* Reconcile the snapshot device list with the new header */
	ret = __rbd_init_snaps_header(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
1746
/*
 * Read the image header, set up the gendisk / request queue for the
 * mapped image, and announce the block device.  Returns 0 or a
 * negative errno; on failure nothing is left registered.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	int rc;
	u64 segment_size;
	u64 total_size = 0;

	/* contact OSD, request size info about the object being mapped */
	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (rc)
		return rc;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = __rbd_init_snaps_header(rbd_dev);
	if (rc)
		return rc;

	/* Resolve the mapped snapshot (or head) and get its size */
	rc = rbd_header_set_snap(rbd_dev, &total_size);
	if (rc)
		return rc;

	/* create gendisk info */
	rc = -ENOMEM;
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		goto out;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	rc = -ENOMEM;
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;
	rbd_dev->q = q;

	/* finally, announce the disk to the world */
	set_capacity(disk, total_size / SECTOR_SIZE);
	add_disk(disk);

	pr_info("%s: added with size 0x%llx\n",
		disk->disk_name, (unsigned long long)total_size);
	return 0;

out_disk:
	put_disk(disk);
out:
	return rc;
}
1819
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001820/*
1821 sysfs
1822*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001823
/* Map a generic struct device back to its containing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1828
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001829static ssize_t rbd_size_show(struct device *dev,
1830 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001831{
Alex Elder593a9e72012-02-07 12:03:37 -06001832 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001833 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001834
Josh Durgina51aa0c2011-12-05 10:35:04 -08001835 down_read(&rbd_dev->header_rwsem);
1836 size = get_capacity(rbd_dev->disk);
1837 up_read(&rbd_dev->header_rwsem);
1838
1839 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001840}
1841
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001842static ssize_t rbd_major_show(struct device *dev,
1843 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001844{
Alex Elder593a9e72012-02-07 12:03:37 -06001845 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001846
1847 return sprintf(buf, "%d\n", rbd_dev->major);
1848}
1849
1850static ssize_t rbd_client_id_show(struct device *dev,
1851 struct device_attribute *attr, char *buf)
1852{
Alex Elder593a9e72012-02-07 12:03:37 -06001853 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001854
Alex Elder1dbb4392012-01-24 10:08:37 -06001855 return sprintf(buf, "client%lld\n",
1856 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001857}
1858
1859static ssize_t rbd_pool_show(struct device *dev,
1860 struct device_attribute *attr, char *buf)
1861{
Alex Elder593a9e72012-02-07 12:03:37 -06001862 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001863
1864 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1865}
1866
Alex Elder9bb2f332012-07-12 10:46:35 -05001867static ssize_t rbd_pool_id_show(struct device *dev,
1868 struct device_attribute *attr, char *buf)
1869{
1870 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1871
1872 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1873}
1874
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001875static ssize_t rbd_name_show(struct device *dev,
1876 struct device_attribute *attr, char *buf)
1877{
Alex Elder593a9e72012-02-07 12:03:37 -06001878 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001879
Alex Elder0bed54d2012-07-03 16:01:18 -05001880 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001881}
1882
1883static ssize_t rbd_snap_show(struct device *dev,
1884 struct device_attribute *attr,
1885 char *buf)
1886{
Alex Elder593a9e72012-02-07 12:03:37 -06001887 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001888
1889 return sprintf(buf, "%s\n", rbd_dev->snap_name);
1890}
1891
1892static ssize_t rbd_image_refresh(struct device *dev,
1893 struct device_attribute *attr,
1894 const char *buf,
1895 size_t size)
1896{
Alex Elder593a9e72012-02-07 12:03:37 -06001897 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001898 int rc;
1899 int ret = size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001900
1901 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1902
Josh Durgin263c6ca2011-12-05 10:43:42 -08001903 rc = __rbd_refresh_header(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001904 if (rc < 0)
1905 ret = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001906
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001907 mutex_unlock(&ctl_mutex);
1908 return ret;
1909}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001910
/*
 * Per-device sysfs attributes, pairing each node with the show/store
 * handlers defined above.
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001920
/* NULL-terminated attribute list registered with the device type. */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	&dev_attr_create_snap.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* Intentionally empty: rbd_device teardown happens via rbd_dev_release. */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name = "rbd",
	.groups = rbd_attr_groups,
	.release = rbd_sysfs_dev_release,
};
1952
1953
1954/*
1955 sysfs - snapshots
1956*/
1957
1958static ssize_t rbd_snap_size_show(struct device *dev,
1959 struct device_attribute *attr,
1960 char *buf)
1961{
1962 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1963
Josh Durgin35915382011-12-05 18:25:13 -08001964 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001965}
1966
1967static ssize_t rbd_snap_id_show(struct device *dev,
1968 struct device_attribute *attr,
1969 char *buf)
1970{
1971 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1972
Josh Durgin35915382011-12-05 18:25:13 -08001973 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001974}
1975
/* Per-snapshot sysfs attributes. */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/*
 * Release hook for snapshot devices: the rbd_snap (and its name) are
 * freed here, when the last reference to the embedded device goes away.
 */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups = rbd_snap_attr_groups,
	.release = rbd_snap_dev_release,
};
2005
/*
 * Unlink "snap" from the device's snapshot list and unregister its
 * sysfs device.  device_unregister() can end up in
 * rbd_snap_dev_release(), which frees the rbd_snap, so the list_del()
 * must happen first.
 */
static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
				  struct rbd_snap *snap)
{
	list_del(&snap->node);
	device_unregister(&snap->dev);
}
2012
2013static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
2014 struct rbd_snap *snap,
2015 struct device *parent)
2016{
2017 struct device *dev = &snap->dev;
2018 int ret;
2019
2020 dev->type = &rbd_snap_device_type;
2021 dev->parent = parent;
2022 dev->release = rbd_snap_dev_release;
2023 dev_set_name(dev, "snap_%s", snap->name);
2024 ret = device_register(dev);
2025
2026 return ret;
2027}
2028
2029static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
2030 int i, const char *name,
2031 struct rbd_snap **snapp)
2032{
2033 int ret;
2034 struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
2035 if (!snap)
2036 return -ENOMEM;
2037 snap->name = kstrdup(name, GFP_KERNEL);
2038 snap->size = rbd_dev->header.snap_sizes[i];
2039 snap->id = rbd_dev->header.snapc->snaps[i];
2040 if (device_is_registered(&rbd_dev->dev)) {
2041 ret = rbd_register_snap_dev(rbd_dev, snap,
2042 &rbd_dev->dev);
2043 if (ret < 0)
2044 goto err;
2045 }
2046 *snapp = snap;
2047 return 0;
2048err:
2049 kfree(snap->name);
2050 kfree(snap);
2051 return ret;
2052}
2053
/*
 * Walk backward through a NUL-delimited list of snapshot names.
 * "name" points at the start of one entry; return a pointer to the
 * start of the previous entry, or NULL if there is none.
 */
const char *rbd_prev_snap_name(const char *name, const char *start)
{
	const char *p;

	/* Need at least one character plus its terminator before "name" */
	if (name - start < 2)
		return NULL;

	/* Step over the previous entry's terminator, then scan back */
	for (p = name - 2; p > start; p--)
		if (!*p)
			return p + 1;

	/* Reached the buffer start */
	return *start ? start : start + 1;
}
2070
/*
 * compare the old list of snapshots that we have to what's in the header
 * and update it accordingly. Note that the header holds the snapshots
 * in a reverse order (from newest to oldest) and we need to go from
 * older to new so that we don't get a duplicate snap name when
 * doing the process (e.g., removed snapshot and recreated a new
 * one with the same name).
 */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
{
	const char *name, *first_name;
	int i = rbd_dev->header.total_snaps;	/* header snaps left to match */
	struct rbd_snap *snap, *old_snap = NULL;
	int ret;
	struct list_head *p, *n;

	/* "name" cursor starts one past the end and walks backward */
	first_name = rbd_dev->header.snap_names;
	name = first_name + rbd_dev->header.snap_names_len;

	/* Walk the existing list oldest-first (reverse of list order) */
	list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
		u64 cur_id;

		old_snap = list_entry(p, struct rbd_snap, node);

		/* cur_id left unset when i == 0; the !i test below short-circuits */
		if (i)
			cur_id = rbd_dev->header.snapc->snaps[i - 1];

		if (!i || old_snap->id < cur_id) {
			/*
			 * old_snap->id was skipped, thus was
			 * removed. If this rbd_dev is mapped to
			 * the removed snapshot, record that it no
			 * longer exists, to prevent further I/O.
			 */
			if (rbd_dev->snap_id == old_snap->id)
				rbd_dev->snap_exists = false;
			__rbd_remove_snap_dev(rbd_dev, old_snap);
			continue;
		}
		if (old_snap->id == cur_id) {
			/* we have this snapshot already */
			i--;
			name = rbd_prev_snap_name(name, first_name);
			continue;
		}
		/* old_snap->id > cur_id: header has new snaps to insert here */
		for (; i > 0;
		     i--, name = rbd_prev_snap_name(name, first_name)) {
			if (!name) {
				WARN_ON(1);
				return -EINVAL;
			}
			/*
			 * NOTE(review): indexes snaps[i] here while the
			 * check above uses snaps[i - 1] — verify this
			 * off-by-one asymmetry is intentional.
			 */
			cur_id = rbd_dev->header.snapc->snaps[i];
			/* snapshot removal? handle it above */
			if (cur_id >= old_snap->id)
				break;
			/* a new snapshot */
			ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
			if (ret < 0)
				return ret;

			/* note that we add it backward so using n and not p */
			list_add(&snap->node, n);
			p = &snap->node;
		}
	}
	/* we're done going over the old snap list, just add what's left */
	for (; i > 0; i--) {
		name = rbd_prev_snap_name(name, first_name);
		if (!name) {
			WARN_ON(1);
			return -EINVAL;
		}
		ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
		if (ret < 0)
			return ret;
		list_add(&snap->node, &rbd_dev->snaps);
	}

	return 0;
}
2151
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002152static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2153{
Alex Elderf0f8cef2012-01-29 13:57:44 -06002154 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002155 struct device *dev;
2156 struct rbd_snap *snap;
2157
2158 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2159 dev = &rbd_dev->dev;
2160
2161 dev->bus = &rbd_bus_type;
2162 dev->type = &rbd_device_type;
2163 dev->parent = &rbd_root_dev;
2164 dev->release = rbd_dev_release;
2165 dev_set_name(dev, "%d", rbd_dev->id);
2166 ret = device_register(dev);
2167 if (ret < 0)
Alex Elderf0f8cef2012-01-29 13:57:44 -06002168 goto out;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002169
2170 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2171 ret = rbd_register_snap_dev(rbd_dev, snap,
2172 &rbd_dev->dev);
2173 if (ret < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002174 break;
2175 }
Alex Elderf0f8cef2012-01-29 13:57:44 -06002176out:
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002177 mutex_unlock(&ctl_mutex);
2178 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002179}
2180
/* Unregister the rbd device; release hooks handle remaining cleanup. */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2185
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002186static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2187{
2188 int ret, rc;
2189
2190 do {
Alex Elder0bed54d2012-07-03 16:01:18 -05002191 ret = rbd_req_sync_watch(rbd_dev, rbd_dev->header_name,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002192 rbd_dev->header.obj_version);
2193 if (ret == -ERANGE) {
2194 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Josh Durgin263c6ca2011-12-05 10:43:42 -08002195 rc = __rbd_refresh_header(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002196 mutex_unlock(&ctl_mutex);
2197 if (rc < 0)
2198 return rc;
2199 }
2200 } while (ret == -ERANGE);
2201
2202 return ret;
2203}
2204
/* Highest device id handed out so far; first id is 1 (see rbd_id_get()). */
static atomic64_t rbd_id_max = ATOMIC64_INIT(0);
2206
/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list. The minimum rbd id is 1.
 */
static void rbd_id_get(struct rbd_device *rbd_dev)
{
	/* atomic64_inc_return() makes id allocation race-free */
	rbd_dev->id = atomic64_inc_return(&rbd_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002219
Alex Elder1ddbe942012-01-29 13:57:44 -06002220/*
Alex Elder499afd52012-02-02 08:13:29 -06002221 * Remove an rbd_dev from the global list, and record that its
2222 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002223 */
Alex Elder499afd52012-02-02 08:13:29 -06002224static void rbd_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002225{
Alex Elderd184f6b2012-01-29 13:57:44 -06002226 struct list_head *tmp;
2227 int rbd_id = rbd_dev->id;
2228 int max_id;
2229
2230 BUG_ON(rbd_id < 1);
Alex Elder499afd52012-02-02 08:13:29 -06002231
2232 spin_lock(&rbd_dev_list_lock);
2233 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002234
2235 /*
2236 * If the id being "put" is not the current maximum, there
2237 * is nothing special we need to do.
2238 */
2239 if (rbd_id != atomic64_read(&rbd_id_max)) {
2240 spin_unlock(&rbd_dev_list_lock);
2241 return;
2242 }
2243
2244 /*
2245 * We need to update the current maximum id. Search the
2246 * list to find out what it is. We're more likely to find
2247 * the maximum at the end, so search the list backward.
2248 */
2249 max_id = 0;
2250 list_for_each_prev(tmp, &rbd_dev_list) {
2251 struct rbd_device *rbd_dev;
2252
2253 rbd_dev = list_entry(tmp, struct rbd_device, node);
2254 if (rbd_id > max_id)
2255 max_id = rbd_id;
2256 }
Alex Elder499afd52012-02-02 08:13:29 -06002257 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002258
Alex Elder1ddbe942012-01-29 13:57:44 -06002259 /*
Alex Elderd184f6b2012-01-29 13:57:44 -06002260 * The max id could have been updated by rbd_id_get(), in
2261 * which case it now accurately reflects the new maximum.
2262 * Be careful not to overwrite the maximum value in that
2263 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002264 */
Alex Elderd184f6b2012-01-29 13:57:44 -06002265 atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
Alex Elderb7f23c32012-01-29 13:57:43 -06002266}
2267
/*
 * Advance *buf past any leading white space and return the length of
 * the token that follows (possibly 0).  *buf must be NUL-terminated.
 */
static inline size_t next_token(const char **buf)
{
	/* Characters isspace() flags in the "C" and "POSIX" locales */
	static const char spaces[] = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* skip to start of token */

	return strcspn(*buf, spaces);	/* measure the token */
}
2286
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if copied,
 * is guaranteed to be terminated with '\0'.  *buf must be terminated
 * with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0');
 * 0 if no token is found, >= token_size if the token would not fit.
 * *buf is advanced past the token even when it does not fit.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2316
2317/*
Alex Elderea3352f2012-07-09 21:04:23 -05002318 * Finds the next token in *buf, dynamically allocates a buffer big
2319 * enough to hold a copy of it, and copies the token into the new
2320 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2321 * that a duplicate buffer is created even for a zero-length token.
2322 *
2323 * Returns a pointer to the newly-allocated duplicate, or a null
2324 * pointer if memory for the duplicate was not available. If
2325 * the lenp argument is a non-null pointer, the length of the token
2326 * (not including the '\0') is returned in *lenp.
2327 *
2328 * If successful, the *buf pointer will be updated to point beyond
2329 * the end of the found token.
2330 *
2331 * Note: uses GFP_KERNEL for allocation.
2332 */
2333static inline char *dup_token(const char **buf, size_t *lenp)
2334{
2335 char *dup;
2336 size_t len;
2337
2338 len = next_token(buf);
2339 dup = kmalloc(len + 1, GFP_KERNEL);
2340 if (!dup)
2341 return NULL;
2342
2343 memcpy(dup, *buf, len);
2344 *(dup + len) = '\0';
2345 *buf += len;
2346
2347 if (lenp)
2348 *lenp = len;
2349
2350 return dup;
2351}
2352
/*
 * This fills in the pool_name, image_name, image_name_len, snap_name,
 * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
 * on the list of monitor addresses and other options provided via
 * /sys/bus/rbd/add.
 *
 * Note: rbd_dev is assumed to have been initially zero-filled.
 */
static int rbd_add_parse_args(struct rbd_device *rbd_dev,
			      const char *buf,
			      const char **mon_addrs,
			      size_t *mon_addrs_size,
			      char *options,
			      size_t options_size)
{
	size_t len;
	int ret;

	/* The first four tokens are required */

	/* Monitor addresses: returned as a pointer into "buf", not copied */
	len = next_token(&buf);
	if (!len)
		return -EINVAL;
	*mon_addrs_size = len + 1;
	*mon_addrs = buf;

	buf += len;

	/* Options string: copied into the caller-supplied buffer */
	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return -EINVAL;

	/* Remaining names are heap-allocated; freed below on error */
	ret = -ENOMEM;
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Create the name of the header object */

	rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
						+ sizeof (RBD_SUFFIX),
					GFP_KERNEL);
	if (!rbd_dev->header_name)
		goto out_err;
	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);

	/*
	 * The snapshot name is optional.  If none is supplied,
	 * we use the default value.
	 */
	rbd_dev->snap_name = dup_token(&buf, &len);
	if (!rbd_dev->snap_name)
		goto out_err;
	if (!len) {
		/* Replace the empty name with the default */
		kfree(rbd_dev->snap_name);
		rbd_dev->snap_name
			= kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
		if (!rbd_dev->snap_name)
			goto out_err;

		memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
			sizeof (RBD_SNAP_HEAD_NAME));
	}

	return 0;

out_err:
	/* pool_name is reset for callers; header/image names are just freed */
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->image_name);
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return ret;
}
2432
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002433static ssize_t rbd_add(struct bus_type *bus,
2434 const char *buf,
2435 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002436{
Alex Eldercb8627c2012-07-09 21:04:23 -05002437 char *options;
2438 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06002439 const char *mon_addrs = NULL;
2440 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06002441 struct ceph_osd_client *osdc;
2442 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002443
2444 if (!try_module_get(THIS_MODULE))
2445 return -ENODEV;
2446
Alex Elder27cc2592012-02-02 08:13:30 -06002447 options = kmalloc(count, GFP_KERNEL);
2448 if (!options)
2449 goto err_nomem;
Alex Eldercb8627c2012-07-09 21:04:23 -05002450 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2451 if (!rbd_dev)
2452 goto err_nomem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002453
2454 /* static rbd_device initialization */
2455 spin_lock_init(&rbd_dev->lock);
2456 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002457 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08002458 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002459
Alex Elderd184f6b2012-01-29 13:57:44 -06002460 /* generate unique id: find highest unique id, add one */
Alex Elder499afd52012-02-02 08:13:29 -06002461 rbd_id_get(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002462
Alex Eldera725f65e2012-02-02 08:13:30 -06002463 /* Fill in the device name, now that we have its id. */
Alex Elder81a89792012-02-02 08:13:30 -06002464 BUILD_BUG_ON(DEV_NAME_LEN
2465 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
2466 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->id);
Alex Eldere124a822012-01-29 13:57:44 -06002467
Alex Eldera725f65e2012-02-02 08:13:30 -06002468 /* parse add command */
Alex Elder7ef32142012-02-02 08:13:30 -06002469 rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
Alex Eldere28fff262012-02-02 08:13:30 -06002470 options, count);
Alex Eldera725f65e2012-02-02 08:13:30 -06002471 if (rc)
2472 goto err_put_id;
2473
Alex Elder5214ecc2012-02-02 08:13:30 -06002474 rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1,
2475 options);
Alex Elderd720bcb2012-02-02 08:13:30 -06002476 if (IS_ERR(rbd_dev->rbd_client)) {
2477 rc = PTR_ERR(rbd_dev->rbd_client);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002478 goto err_put_id;
Alex Elderd720bcb2012-02-02 08:13:30 -06002479 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002480
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002481 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06002482 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002483 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2484 if (rc < 0)
2485 goto err_out_client;
Alex Elder9bb2f332012-07-12 10:46:35 -05002486 rbd_dev->pool_id = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002487
2488 /* register our block device */
Alex Elder27cc2592012-02-02 08:13:30 -06002489 rc = register_blkdev(0, rbd_dev->name);
2490 if (rc < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002491 goto err_out_client;
Alex Elder27cc2592012-02-02 08:13:30 -06002492 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002493
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002494 rc = rbd_bus_add_dev(rbd_dev);
2495 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002496 goto err_out_blkdev;
2497
Alex Elder32eec682012-02-08 16:11:14 -06002498 /*
2499 * At this point cleanup in the event of an error is the job
2500 * of the sysfs code (initiated by rbd_bus_del_dev()).
2501 *
2502 * Set up and announce blkdev mapping.
2503 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002504 rc = rbd_init_disk(rbd_dev);
2505 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002506 goto err_out_bus;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002507
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002508 rc = rbd_init_watch_dev(rbd_dev);
2509 if (rc)
2510 goto err_out_bus;
2511
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002512 return count;
2513
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002514err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002515 /* this will also clean up rest of rbd_dev stuff */
2516
2517 rbd_bus_del_dev(rbd_dev);
2518 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002519 return rc;
2520
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002521err_out_blkdev:
2522 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2523err_out_client:
2524 rbd_put_client(rbd_dev);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002525err_put_id:
Alex Eldercb8627c2012-07-09 21:04:23 -05002526 if (rbd_dev->pool_name) {
Alex Elder820a5f32012-07-09 21:04:24 -05002527 kfree(rbd_dev->snap_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002528 kfree(rbd_dev->header_name);
2529 kfree(rbd_dev->image_name);
Alex Eldercb8627c2012-07-09 21:04:23 -05002530 kfree(rbd_dev->pool_name);
2531 }
Alex Elder499afd52012-02-02 08:13:29 -06002532 rbd_id_put(rbd_dev);
Alex Elder27cc2592012-02-02 08:13:30 -06002533err_nomem:
Alex Elder27cc2592012-02-02 08:13:30 -06002534 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05002535 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06002536
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002537 dout("Error adding device %s\n", buf);
2538 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06002539
2540 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002541}
2542
2543static struct rbd_device *__rbd_get_dev(unsigned long id)
2544{
2545 struct list_head *tmp;
2546 struct rbd_device *rbd_dev;
2547
Alex Eldere124a822012-01-29 13:57:44 -06002548 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002549 list_for_each(tmp, &rbd_dev_list) {
2550 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Eldere124a822012-01-29 13:57:44 -06002551 if (rbd_dev->id == id) {
2552 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002553 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06002554 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002555 }
Alex Eldere124a822012-01-29 13:57:44 -06002556 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002557 return NULL;
2558}
2559
/*
 * Driver-core release callback for an rbd device, invoked once the
 * last reference to the embedded struct device is dropped (after
 * rbd_bus_del_dev()).  Undoes the setup performed in rbd_add(): stops
 * header watching, drops the ceph client, frees the disk and block
 * device registration, releases the id and all name allocations, and
 * finally drops the module reference taken in rbd_add().
 *
 * NOTE: the teardown order matters — the watch must be torn down
 * while the osd client is still valid, and the names are freed only
 * after nothing can reference them.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Cancel the lingering watch request before the client goes away. */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev, rbd_dev->header_name);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->snap_name);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref, balancing try_module_get() in rbd_add() */
	module_put(THIS_MODULE);
}
2590
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002591static ssize_t rbd_remove(struct bus_type *bus,
2592 const char *buf,
2593 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002594{
2595 struct rbd_device *rbd_dev = NULL;
2596 int target_id, rc;
2597 unsigned long ul;
2598 int ret = count;
2599
2600 rc = strict_strtoul(buf, 10, &ul);
2601 if (rc)
2602 return rc;
2603
2604 /* convert to int; abort if we lost anything in the conversion */
2605 target_id = (int) ul;
2606 if (target_id != ul)
2607 return -EINVAL;
2608
2609 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2610
2611 rbd_dev = __rbd_get_dev(target_id);
2612 if (!rbd_dev) {
2613 ret = -ENOENT;
2614 goto done;
2615 }
2616
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002617 __rbd_remove_all_snaps(rbd_dev);
2618 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002619
2620done:
2621 mutex_unlock(&ctl_mutex);
2622 return ret;
2623}
2624
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002625static ssize_t rbd_snap_add(struct device *dev,
2626 struct device_attribute *attr,
2627 const char *buf,
2628 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002629{
Alex Elder593a9e72012-02-07 12:03:37 -06002630 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002631 int ret;
2632 char *name = kmalloc(count + 1, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002633 if (!name)
2634 return -ENOMEM;
2635
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002636 snprintf(name, count, "%s", buf);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002637
2638 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2639
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002640 ret = rbd_header_add_snap(rbd_dev,
2641 name, GFP_KERNEL);
2642 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002643 goto err_unlock;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002644
Josh Durgin263c6ca2011-12-05 10:43:42 -08002645 ret = __rbd_refresh_header(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002646 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002647 goto err_unlock;
2648
2649 /* shouldn't hold ctl_mutex when notifying.. notify might
2650 trigger a watch callback that would need to get that mutex */
2651 mutex_unlock(&ctl_mutex);
2652
2653 /* make a best effort, don't error if failed */
Alex Elder0bed54d2012-07-03 16:01:18 -05002654 rbd_req_sync_notify(rbd_dev, rbd_dev->header_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002655
2656 ret = count;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002657 kfree(name);
2658 return ret;
2659
2660err_unlock:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002661 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002662 kfree(name);
2663 return ret;
2664}
2665
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002666/*
2667 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002668 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002669 */
2670static int rbd_sysfs_init(void)
2671{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002672 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002673
Alex Elderfed4c142012-02-07 12:03:36 -06002674 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06002675 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002676 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002677
Alex Elderfed4c142012-02-07 12:03:36 -06002678 ret = bus_register(&rbd_bus_type);
2679 if (ret < 0)
2680 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002681
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002682 return ret;
2683}
2684
/*
 * Remove the rbd sysfs entries, in the reverse order of
 * rbd_sysfs_init(): the bus is unregistered before the root
 * device it hangs off of.
 */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
2690
2691int __init rbd_init(void)
2692{
2693 int rc;
2694
2695 rc = rbd_sysfs_init();
2696 if (rc)
2697 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002698 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002699 return 0;
2700}
2701
/*
 * Module unload entry point: tear down the sysfs bus and root
 * device registered by rbd_init().
 */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2706
2707module_init(rbd_init);
2708module_exit(rbd_exit);
2709
2710MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
2711MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
2712MODULE_DESCRIPTION("rados block device");
2713
2714/* following authorship retained from original osdblk.c */
2715MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
2716
2717MODULE_LICENSE("GPL");