blob: 43f6ef8d696f093f3ef28df4456c971a227f32c4 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elder593a9e72012-02-07 12:03:37 -060044/*
45 * The basic unit of block I/O is a sector. It is interpreted in a
46 * number of contexts in Linux (blk, bio, genhd), but the default is
47 * universally 512 bytes. These symbols are just slightly more
48 * meaningful than the bare numbers they represent.
49 */
50#define SECTOR_SHIFT 9
51#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
52
Alex Elderf0f8cef2012-01-29 13:57:44 -060053#define RBD_DRV_NAME "rbd"
54#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070055
56#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
57
Yehuda Sadeh602adf42010-08-12 16:11:25 -070058#define RBD_MAX_SNAP_NAME_LEN 32
59#define RBD_MAX_OPT_LEN 1024
60
61#define RBD_SNAP_HEAD_NAME "-"
62
Alex Elder81a89792012-02-02 08:13:30 -060063/*
64 * An RBD device name will be "rbd#", where the "rbd" comes from
65 * RBD_DRV_NAME above, and # is a unique integer identifier.
66 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
67 * enough to hold all possible device names.
68 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070069#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060070#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070071
Alex Eldercc0538b2012-08-10 13:12:07 -070072#define RBD_READ_ONLY_DEFAULT false
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070073
Yehuda Sadeh602adf42010-08-12 16:11:25 -070074/*
75 * block device image metadata (in-memory version)
76 */
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	u64 image_size;		/* image size in bytes (of the head) */
	char *object_prefix;	/* data object name prefix (kmalloc'd) */
	__u8 obj_order;		/* object size is 1 << obj_order bytes */
	__u8 crypt_type;	/* on-disk encryption type */
	__u8 comp_type;		/* on-disk compression type */
	struct ceph_snap_context *snapc;	/* snapshot ids, in on-disk order */
	u32 total_snaps;	/* number of snapshots */

	char *snap_names;	/* '\0'-separated names, same order as snapc */
	u64 *snap_sizes;	/* image size at each snapshot, same order */

	u64 obj_version;	/* version of the header this was read from */
};
91
/* rbd-specific (non-libceph) mapping options */
struct rbd_options {
	bool read_only;		/* map the device read-only */
};
95
96/*
Alex Elderf0f8cef2012-01-29 13:57:44 -060097 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -070098 */
/*
 * an instance of the client. multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client *client;	/* underlying ceph client */
	struct kref kref;		/* released via rbd_client_release() */
	struct list_head node;		/* entry in rbd_client_list */
};
104
105/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600106 * a request completion status
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700107 */
/*
 * a request completion status
 */
struct rbd_req_status {
	int done;	/* completion flag */
	int rc;		/* completion result code */
	u64 bytes;	/* byte count for this request */
};
113
114/*
115 * a collection of requests
116 */
117struct rbd_req_coll {
118 int total;
119 int num_done;
120 struct kref kref;
121 struct rbd_req_status status[0];
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700122};
123
Alex Elderf0f8cef2012-01-29 13:57:44 -0600124/*
125 * a single io request
126 */
/*
 * a single io request
 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;			/* data length of this request */
	int coll_index;			/* index within coll — presumably the
					 * status[] slot; see
					 * rbd_coll_end_req_index() */
	struct rbd_req_coll *coll;	/* owning request collection */
};
135
/* in-memory record of one snapshot, exposed via sysfs */
struct rbd_snap {
	struct device dev;	/* sysfs device for this snapshot */
	const char *name;	/* snapshot name */
	u64 size;		/* image size at this snapshot */
	struct list_head node;	/* entry in rbd_device->snaps */
	u64 id;			/* snapshot id */
};
143
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700144/*
145 * a single device
146 */
/*
 * a single device
 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;		/* blkdev's gendisk and rq */
	struct request_queue *q;

	struct rbd_options rbd_opts;	/* parsed rbd-specific options */
	struct rbd_client *rbd_client;	/* shared, refcounted ceph client */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;		/* queue lock */

	struct rbd_image_header header;	/* in-memory image metadata */
	char *image_name;		/* rbd image name */
	size_t image_name_len;
	char *header_name;		/* NOTE(review): presumably the name of
					 * the on-disk header object — confirm
					 * against rbd_refresh_header() */
	char *pool_name;		/* rados pool holding the image */
	int pool_id;

	struct ceph_osd_event *watch_event;
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore header_rwsem;
	/* name of the snapshot this device reads from */
	char *snap_name;
	/* id of the snapshot this device reads from */
	u64 snap_id;		/* current snapshot id */
	/* whether the snap_id this device reads from still exists */
	bool snap_exists;
	bool read_only;		/* effective state; forced true for snapshots */

	struct list_head node;		/* entry in rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
};
189
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700190static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a82f2012-01-29 13:57:44 -0600191
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700192static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a82f2012-01-29 13:57:44 -0600193static DEFINE_SPINLOCK(rbd_dev_list_lock);
194
Alex Elder432b8582012-01-29 13:57:44 -0600195static LIST_HEAD(rbd_client_list); /* clients */
196static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700197
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800198static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
199static void rbd_dev_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800200static ssize_t rbd_snap_add(struct device *dev,
201 struct device_attribute *attr,
202 const char *buf,
203 size_t count);
Alex Elder14e70852012-07-19 09:09:27 -0500204static void __rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800205
Alex Elderf0f8cef2012-01-29 13:57:44 -0600206static ssize_t rbd_add(struct bus_type *bus, const char *buf,
207 size_t count);
208static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
209 size_t count);
210
/* bus attributes: /sys/bus/rbd/add and /sys/bus/rbd/remove (write-only) */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

/* the rbd pseudo-bus all rbd devices hang off of */
static struct bus_type rbd_bus_type = {
	.name = "rbd",
	.bus_attrs = rbd_bus_attrs,
};
221
/*
 * Release callback for rbd_root_dev.  Intentionally empty: the device
 * is statically allocated, so there is nothing to free (the driver
 * core complains about devices that lack a release callback).
 */
static void rbd_root_dev_release(struct device *dev)
{
}

/* parent device for all rbd devices */
static struct device rbd_root_dev = {
	.init_name = "rbd",
	.release = rbd_root_dev_release,
};
230
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800231
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800232static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
233{
234 return get_device(&rbd_dev->dev);
235}
236
237static void rbd_put_dev(struct rbd_device *rbd_dev)
238{
239 put_device(&rbd_dev->dev);
240}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700241
Alex Elder1fe5e992012-07-25 09:32:41 -0500242static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700243
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700244static int rbd_open(struct block_device *bdev, fmode_t mode)
245{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600246 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700247
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700248 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
249 return -EROFS;
250
Alex Elder340c7a22012-08-10 13:12:07 -0700251 rbd_get_dev(rbd_dev);
252 set_device_ro(bdev, rbd_dev->read_only);
253
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700254 return 0;
255}
256
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800257static int rbd_release(struct gendisk *disk, fmode_t mode)
258{
259 struct rbd_device *rbd_dev = disk->private_data;
260
261 rbd_put_dev(rbd_dev);
262
263 return 0;
264}
265
/* block device operations: rbd only implements open/release */
static const struct block_device_operations rbd_bd_ops = {
	.owner = THIS_MODULE,
	.open = rbd_open,
	.release = rbd_release,
};
271
272/*
273 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500274 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700275 */
Alex Elderf8c38922012-08-10 13:12:07 -0700276static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700277{
278 struct rbd_client *rbdc;
279 int ret = -ENOMEM;
280
281 dout("rbd_client_create\n");
282 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
283 if (!rbdc)
284 goto out_opt;
285
286 kref_init(&rbdc->kref);
287 INIT_LIST_HEAD(&rbdc->node);
288
Alex Elderbc534d82012-01-29 13:57:44 -0600289 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
290
Alex Elder43ae4702012-07-03 16:01:18 -0500291 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700292 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600293 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500294 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700295
296 ret = ceph_open_session(rbdc->client);
297 if (ret < 0)
298 goto out_err;
299
Alex Elder432b8582012-01-29 13:57:44 -0600300 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700301 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600302 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700303
Alex Elderbc534d82012-01-29 13:57:44 -0600304 mutex_unlock(&ctl_mutex);
305
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700306 dout("rbd_client_create created %p\n", rbdc);
307 return rbdc;
308
309out_err:
310 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600311out_mutex:
312 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700313 kfree(rbdc);
314out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500315 if (ceph_opts)
316 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400317 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700318}
319
320/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700321 * Find a ceph client with specific addr and configuration. If
322 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700323 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700324static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700325{
326 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700327 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700328
Alex Elder43ae4702012-07-03 16:01:18 -0500329 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700330 return NULL;
331
Alex Elder1f7ba332012-08-10 13:12:07 -0700332 spin_lock(&rbd_client_list_lock);
333 list_for_each_entry(client_node, &rbd_client_list, node) {
334 if (!ceph_compare_options(ceph_opts, client_node->client)) {
335 kref_get(&client_node->kref);
336 found = true;
337 break;
338 }
339 }
340 spin_unlock(&rbd_client_list_lock);
341
342 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700343}
344
345/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700346 * mount options
347 */
/*
 * mount options
 *
 * Token values below Opt_last_int take an integer argument; values
 * between Opt_last_int and Opt_last_string take a string; values
 * between Opt_last_string and Opt_last_bool are Boolean flags.
 * parse_rbd_opts_token() relies on this ordering.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};
369
370static int parse_rbd_opts_token(char *c, void *private)
371{
Alex Elder43ae4702012-07-03 16:01:18 -0500372 struct rbd_options *rbd_opts = private;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700373 substring_t argstr[MAX_OPT_ARGS];
374 int token, intval, ret;
375
Alex Elder43ae4702012-07-03 16:01:18 -0500376 token = match_token(c, rbd_opts_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700377 if (token < 0)
378 return -EINVAL;
379
380 if (token < Opt_last_int) {
381 ret = match_int(&argstr[0], &intval);
382 if (ret < 0) {
383 pr_err("bad mount option arg (not int) "
384 "at '%s'\n", c);
385 return ret;
386 }
387 dout("got int token %d val %d\n", token, intval);
388 } else if (token > Opt_last_int && token < Opt_last_string) {
389 dout("got string token %d val %s\n", token,
390 argstr[0].from);
Alex Eldercc0538b2012-08-10 13:12:07 -0700391 } else if (token > Opt_last_string && token < Opt_last_bool) {
392 dout("got Boolean token %d\n", token);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700393 } else {
394 dout("got token %d\n", token);
395 }
396
397 switch (token) {
Alex Eldercc0538b2012-08-10 13:12:07 -0700398 case Opt_read_only:
399 rbd_opts->read_only = true;
400 break;
401 case Opt_read_write:
402 rbd_opts->read_only = false;
403 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700404 default:
405 BUG_ON(token);
406 }
407 return 0;
408}
409
410/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700411 * Get a ceph client with specific addr and configuration, if one does
412 * not exist create it.
413 */
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  On success the client is stored in
 * rbd_dev->rbd_client and 0 is returned; on error a negative errno.
 * The parsed ceph options are always consumed (shared-client path
 * destroys them; create path takes ownership).
 */
static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
			  size_t mon_addr_len, char *options)
{
	struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
	struct ceph_options *ceph_opts;
	struct rbd_client *rbdc;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	/* parses libceph options, forwarding unknown ones to
	 * parse_rbd_opts_token() */
	ceph_opts = ceph_parse_options(options, mon_addr,
				       mon_addr + mon_addr_len,
				       parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(ceph_opts))
		return PTR_ERR(ceph_opts);

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		/* using an existing client */
		ceph_destroy_options(ceph_opts);
	} else {
		rbdc = rbd_client_create(ceph_opts);	/* consumes ceph_opts */
		if (IS_ERR(rbdc))
			return PTR_ERR(rbdc);
	}
	rbd_dev->rbd_client = rbdc;

	return 0;
}
442
443/*
444 * Destroy ceph client
Alex Elderd23a4b32012-01-29 13:57:43 -0600445 *
Alex Elder432b8582012-01-29 13:57:44 -0600446 * Caller must hold rbd_client_list_lock.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700447 */
/*
 * Destroy ceph client.  kref release callback, invoked when the last
 * reference is dropped.  Takes rbd_client_list_lock itself to unlink
 * the client, so the caller must NOT hold that lock (the previous
 * comment claiming the caller must hold it was stale — doing so
 * would deadlock on the spin_lock below).
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
460
461/*
462 * Drop reference to ceph client node. If it's not referenced anymore, release
463 * it.
464 */
465static void rbd_put_client(struct rbd_device *rbd_dev)
466{
467 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
468 rbd_dev->rbd_client = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700469}
470
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700471/*
472 * Destroy requests collection
473 */
474static void rbd_coll_release(struct kref *kref)
475{
476 struct rbd_req_coll *coll =
477 container_of(kref, struct rbd_req_coll, kref);
478
479 dout("rbd_coll_release %p\n", coll);
480 kfree(coll);
481}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700482
Alex Elder8e94af82012-07-25 09:32:40 -0500483static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
484{
Alex Elder103a1502012-08-02 11:29:45 -0500485 size_t size;
486 u32 snap_count;
487
488 /* The header has to start with the magic rbd header text */
489 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
490 return false;
491
492 /*
493 * The size of a snapshot header has to fit in a size_t, and
494 * that limits the number of snapshots.
495 */
496 snap_count = le32_to_cpu(ondisk->snap_count);
497 size = SIZE_MAX - sizeof (struct ceph_snap_context);
498 if (snap_count > size / sizeof (__le64))
499 return false;
500
501 /*
502 * Not only that, but the size of the entire the snapshot
503 * header must also be representable in a size_t.
504 */
505 size -= snap_count * sizeof (__le64);
506 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
507 return false;
508
509 return true;
Alex Elder8e94af82012-07-25 09:32:40 -0500510}
511
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700512/*
513 * Create a new header structure, translate header format from the on-disk
514 * header.
515 */
516static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500517 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700518{
Alex Elderccece232012-07-10 20:30:10 -0500519 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500520 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500521 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500522 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700523
Alex Elder6a523252012-07-19 17:12:59 -0500524 memset(header, 0, sizeof (*header));
525
Alex Elder103a1502012-08-02 11:29:45 -0500526 snap_count = le32_to_cpu(ondisk->snap_count);
527
Alex Elder58c17b02012-08-23 23:22:06 -0500528 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
529 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500530 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700531 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500532 memcpy(header->object_prefix, ondisk->object_prefix, len);
533 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600534
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700535 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500536 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
537
Alex Elder621901d2012-08-23 23:22:06 -0500538 /* Save a copy of the snapshot names */
539
Alex Elderf785cc12012-08-23 23:22:06 -0500540 if (snap_names_len > (u64) SIZE_MAX)
541 return -EIO;
542 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700543 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500544 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500545 /*
546 * Note that rbd_dev_v1_header_read() guarantees
547 * the ondisk buffer we're working with has
548 * snap_names_len bytes beyond the end of the
549 * snapshot id array, this memcpy() is safe.
550 */
551 memcpy(header->snap_names, &ondisk->snaps[snap_count],
552 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500553
Alex Elder621901d2012-08-23 23:22:06 -0500554 /* Record each snapshot's size */
555
Alex Elderd2bb24e2012-07-26 23:37:14 -0500556 size = snap_count * sizeof (*header->snap_sizes);
557 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700558 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500559 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500560 for (i = 0; i < snap_count; i++)
561 header->snap_sizes[i] =
562 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700563 } else {
Alex Elderccece232012-07-10 20:30:10 -0500564 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700565 header->snap_names = NULL;
566 header->snap_sizes = NULL;
567 }
Alex Elder849b4262012-07-09 21:04:24 -0500568
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700569 header->image_size = le64_to_cpu(ondisk->image_size);
570 header->obj_order = ondisk->options.order;
571 header->crypt_type = ondisk->options.crypt_type;
572 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500573 header->total_snaps = snap_count;
574
Alex Elder621901d2012-08-23 23:22:06 -0500575 /* Allocate and fill in the snapshot context */
576
Alex Elder6a523252012-07-19 17:12:59 -0500577 size = sizeof (struct ceph_snap_context);
578 size += snap_count * sizeof (header->snapc->snaps[0]);
579 header->snapc = kzalloc(size, GFP_KERNEL);
580 if (!header->snapc)
581 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700582
583 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500584 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700585 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500586 for (i = 0; i < snap_count; i++)
587 header->snapc->snaps[i] =
588 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700589
590 return 0;
591
Alex Elder6a523252012-07-19 17:12:59 -0500592out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500593 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500594 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700595 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500596 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500597 kfree(header->object_prefix);
598 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500599
Alex Elder00f1f362012-02-07 12:03:36 -0600600 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700601}
602
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700603static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
604 u64 *seq, u64 *size)
605{
606 int i;
607 char *p = header->snap_names;
608
Alex Elder00f1f362012-02-07 12:03:36 -0600609 for (i = 0; i < header->total_snaps; i++) {
610 if (!strcmp(snap_name, p)) {
611
612 /* Found it. Pass back its id and/or size */
613
614 if (seq)
615 *seq = header->snapc->snaps[i];
616 if (size)
617 *size = header->snap_sizes[i];
618 return i;
619 }
620 p += strlen(p) + 1; /* Skip ahead to the next name */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700621 }
Alex Elder00f1f362012-02-07 12:03:36 -0600622 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700623}
624
/*
 * Apply rbd_dev->snap_name to the device: mapping either the image
 * head (writable unless the read_only option was given) or a named
 * snapshot (always read-only).  Optionally passes back the mapped
 * size.  Takes header_rwsem for writing while updating snap state.
 */
static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
{
	int ret;

	down_write(&rbd_dev->header_rwsem);

	if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		/* mapping the head: no snapshot id applies */
		rbd_dev->snap_id = CEPH_NOSNAP;
		rbd_dev->snap_exists = false;
		rbd_dev->read_only = rbd_dev->rbd_opts.read_only;
		if (size)
			*size = rbd_dev->header.image_size;
	} else {
		u64 snap_id = 0;

		/* look up the named snapshot's id and size */
		ret = snap_by_name(&rbd_dev->header, rbd_dev->snap_name,
				   &snap_id, size);
		if (ret < 0)
			goto done;
		rbd_dev->snap_id = snap_id;
		rbd_dev->snap_exists = true;
		rbd_dev->read_only = true;	/* No choice for snapshots */
	}

	ret = 0;
done:
	up_write(&rbd_dev->header_rwsem);
	return ret;
}
655
656static void rbd_header_free(struct rbd_image_header *header)
657{
Alex Elder849b4262012-07-09 21:04:24 -0500658 kfree(header->object_prefix);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500659 header->object_prefix = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700660 kfree(header->snap_sizes);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500661 header->snap_sizes = NULL;
Alex Elder849b4262012-07-09 21:04:24 -0500662 kfree(header->snap_names);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500663 header->snap_names = NULL;
Josh Durgind1d25642011-12-05 14:03:05 -0800664 ceph_put_snap_context(header->snapc);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500665 header->snapc = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700666}
667
668/*
669 * get the actual striped segment name, offset and length
670 */
671static u64 rbd_get_segment(struct rbd_image_header *header,
Alex Elderca1e49a2012-07-10 20:30:09 -0500672 const char *object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700673 u64 ofs, u64 len,
674 char *seg_name, u64 *segofs)
675{
676 u64 seg = ofs >> header->obj_order;
677
678 if (seg_name)
679 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
Alex Elderca1e49a2012-07-10 20:30:09 -0500680 "%s.%012llx", object_prefix, seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700681
682 ofs = ofs & ((1 << header->obj_order) - 1);
683 len = min_t(u64, len, (1 << header->obj_order) - ofs);
684
685 if (segofs)
686 *segofs = ofs;
687
688 return len;
689}
690
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700691static int rbd_get_num_segments(struct rbd_image_header *header,
692 u64 ofs, u64 len)
693{
694 u64 start_seg = ofs >> header->obj_order;
695 u64 end_seg = (ofs + len - 1) >> header->obj_order;
696 return end_seg - start_seg + 1;
697}
698
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700699/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700700 * returns the size of an object in the image
701 */
702static u64 rbd_obj_bytes(struct rbd_image_header *header)
703{
704 return 1 << header->obj_order;
705}
706
707/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700708 * bio helpers
709 */
710
711static void bio_chain_put(struct bio *chain)
712{
713 struct bio *tmp;
714
715 while (chain) {
716 tmp = chain;
717 chain = chain->bi_next;
718 bio_put(tmp);
719 }
720}
721
722/*
723 * zeros a bio chain, starting at specific offset
724 */
/*
 * zeros a bio chain, starting at specific offset
 *
 * Every data byte at chain offset >= start_ofs is cleared; bytes
 * before start_ofs are left untouched.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* running byte offset within the whole chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* zero the part of this segment that lies
				 * at or beyond start_ofs (remainder is how
				 * many leading bytes to preserve) */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
748
749/*
750 * bio_chain_clone - clone a chain of bios up to a certain length.
751 * might return a bio_pair that will need to be released.
752 */
753static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
754 struct bio_pair **bp,
755 int len, gfp_t gfpmask)
756{
Alex Elder542582f2012-08-09 10:33:25 -0700757 struct bio *old_chain = *old;
758 struct bio *new_chain = NULL;
759 struct bio *tail;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700760 int total = 0;
761
762 if (*bp) {
763 bio_pair_release(*bp);
764 *bp = NULL;
765 }
766
767 while (old_chain && (total < len)) {
Alex Elder542582f2012-08-09 10:33:25 -0700768 struct bio *tmp;
769
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700770 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
771 if (!tmp)
772 goto err_out;
Alex Elder542582f2012-08-09 10:33:25 -0700773 gfpmask &= ~__GFP_WAIT; /* can't wait after the first */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700774
775 if (total + old_chain->bi_size > len) {
776 struct bio_pair *bp;
777
778 /*
779 * this split can only happen with a single paged bio,
780 * split_bio will BUG_ON if this is not the case
781 */
782 dout("bio_chain_clone split! total=%d remaining=%d"
Alex Elderbd919d42012-07-13 20:35:11 -0500783 "bi_size=%u\n",
784 total, len - total, old_chain->bi_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700785
786 /* split the bio. We'll release it either in the next
787 call, or it will have to be released outside */
Alex Elder593a9e72012-02-07 12:03:37 -0600788 bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700789 if (!bp)
790 goto err_out;
791
792 __bio_clone(tmp, &bp->bio1);
793
794 *next = &bp->bio2;
795 } else {
796 __bio_clone(tmp, old_chain);
797 *next = old_chain->bi_next;
798 }
799
800 tmp->bi_bdev = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700801 tmp->bi_next = NULL;
Alex Elder542582f2012-08-09 10:33:25 -0700802 if (new_chain)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700803 tail->bi_next = tmp;
Alex Elder542582f2012-08-09 10:33:25 -0700804 else
805 new_chain = tmp;
806 tail = tmp;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700807 old_chain = old_chain->bi_next;
808
809 total += tmp->bi_size;
810 }
811
812 BUG_ON(total < len);
813
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700814 *old = old_chain;
815
816 return new_chain;
817
818err_out:
819 dout("bio_chain_clone with err\n");
820 bio_chain_put(new_chain);
821 return NULL;
822}
823
824/*
825 * helpers for osd request op vectors.
826 */
Alex Elder57cfc102012-06-26 12:57:03 -0700827static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
828 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700829{
Alex Elder57cfc102012-06-26 12:57:03 -0700830 struct ceph_osd_req_op *ops;
831
832 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
833 if (!ops)
834 return NULL;
835
836 ops[0].op = opcode;
837
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700838 /*
839 * op extent offset and length will be set later on
840 * in calc_raw_layout()
841 */
Alex Elder57cfc102012-06-26 12:57:03 -0700842 ops[0].payload_len = payload_len;
843
844 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700845}
846
/* Free an op vector allocated by rbd_create_rw_ops(). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
851
/*
 * Mark segment @index of a collection complete and, under the queue
 * lock, end the run of contiguous completed segments on the block
 * request in order.
 *
 * @rq:    block-layer request being serviced (NULL for sync callers —
 *         then there is nothing to complete and we return early)
 * @coll:  per-request collection of segment statuses (NULL means the
 *         request is a single unit and is ended in one call)
 * @index: which segment finished
 * @ret:   completion status for that segment
 * @len:   bytes completed for that segment
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	if (!coll) {
		/* no collection: complete the whole request at once */
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	/* status[] and num_done are protected by the queue lock */
	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	max = min = coll->num_done;
	/* extend over every already-done segment following num_done */
	while (max < coll->total && coll->status[max].done)
		max++;

	/* segments must be ended in order; drop one coll ref per segment */
	for (i = min; i<max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
889
/* Complete the collection segment this rbd_request was issued for. */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
895
/*
 * Send ceph osd request
 *
 * Build and submit one OSD request against @object_name for the byte
 * range [@ofs, @ofs+@len).  Data travels either via @bio (block I/O
 * path) or via @pages/@num_pages (sync path).  If @rbd_cb is NULL the
 * call waits for completion and puts the request itself; otherwise the
 * callback owns completion.  If @linger_req is non-NULL the request is
 * registered as lingering and returned through it.  @coll/@coll_index
 * identify which segment of the block request this is, for completion
 * accounting.  Returns 0/positive on success, negative errno on error.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *ops,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *req,
					 struct ceph_msg *msg),
			  struct ceph_osd_request **linger_req,
			  u64 *ver)
{
	struct ceph_osd_request *req;
	struct ceph_file_layout *layout;
	int ret;
	u64 bno;
	struct timespec mtime = CURRENT_TIME;
	struct rbd_request *req_data;
	struct ceph_osd_request_head *reqhead;
	struct ceph_osd_client *osdc;

	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
	if (!req_data) {
		/* still account the failed segment to the collection */
		if (coll)
			rbd_coll_end_req_index(rq, coll, coll_index,
					       -ENOMEM, len);
		return -ENOMEM;
	}

	if (coll) {
		req_data->coll = coll;
		req_data->coll_index = coll_index;
	}

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
	     (unsigned long long) ofs, (unsigned long long) len);

	osdc = &rbd_dev->rbd_client->client->osdc;
	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
				      false, GFP_NOIO, pages, bio);
	if (!req) {
		ret = -ENOMEM;
		goto done_pages;
	}

	req->r_callback = rbd_cb;

	req_data->rq = rq;
	req_data->bio = bio;
	req_data->pages = pages;
	req_data->len = len;

	req->r_priv = req_data;

	/*
	 * NOTE(review): placeholder snapid in the request head; the real
	 * @snapid is passed to ceph_calc_raw_layout() below — presumably
	 * that fills in the head.  Confirm against osd_client.
	 */
	reqhead = req->r_request->front.iov_base;
	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);

	/*
	 * NOTE(review): strncpy does not NUL-terminate if object_name is
	 * >= sizeof(req->r_oid); strlen below would then overrun.  Assumes
	 * object names always fit — confirm.
	 */
	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
	req->r_oid_len = strlen(req->r_oid);

	/* one object per "stripe": unit == object size, single stripe */
	layout = &req->r_file_layout;
	memset(layout, 0, sizeof(*layout));
	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_stripe_count = cpu_to_le32(1);
	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
	ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
			     req, ops);

	ceph_osdc_build_request(req, ofs, &len,
				ops,
				snapc,
				&mtime,
				req->r_oid, req->r_oid_len);

	if (linger_req) {
		ceph_osdc_set_request_linger(osdc, req);
		*linger_req = req;
	}

	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* synchronous caller: wait here and drop our ref */
		ret = ceph_osdc_wait_request(osdc, req);
		if (ver)
			*ver = le64_to_cpu(req->r_reassert_version.version);
		dout("reassert_ver=%llu\n",
		     (unsigned long long)
		     le64_to_cpu(req->r_reassert_version.version));
		ceph_osdc_put_request(req);
	}
	return ret;

done_err:
	bio_chain_put(req_data->bio);
	ceph_osdc_put_request(req);
done_pages:
	/* report the failure for this segment before freeing req_data */
	rbd_coll_end_req(req_data, ret, len);
	kfree(req_data);
	return ret;
}
1007
/*
 * Ceph osd op callback
 *
 * Completion handler for async I/O issued by rbd_do_op(): decodes the
 * reply, normalizes read results, completes the collection segment,
 * and drops the bio chain, request, and per-request state.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);	/* ops array follows the head */
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
	     (unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* reading a nonexistent object yields all zeroes */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero-fill the tail up to the requested len */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1047
/* Minimal completion callback: just drop the request reference. */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1052
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a temporary page vector for the transfer, submits the
 * request via rbd_do_request() with no callback (so it waits), and for
 * reads copies the result into @buf.  @ops must be non-NULL.  Returns
 * the (possibly positive byte-count) result of the request, or a
 * negative errno.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int flags,
			   struct ceph_osd_req_op *ops,
			   const char *object_name,
			   u64 ofs, u64 len,
			   char *buf,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	BUG_ON(ops == NULL);

	num_pages = calc_pages_for(ofs , len);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* NULL rbd_cb makes rbd_do_request wait for completion */
	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			  object_name, ofs, len, NULL,
			  pages, num_pages,
			  flags,
			  ops,
			  NULL, 0,
			  NULL,
			  linger_req, ver);
	if (ret < 0)
		goto done;

	/* on a read, ret is the number of bytes to copy out */
	if ((flags & CEPH_OSD_FLAG_READ) && buf)
		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1096
/*
 * Do an asynchronous ceph osd operation
 *
 * Issues @opcode against the single object segment containing the
 * image range [@ofs, @ofs+@len); the range must not cross a segment
 * boundary (guaranteed by the earlier bio cloning).  Completion is
 * delivered through rbd_req_cb to segment @coll_index of @coll.
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 snapid,
		     int opcode, int flags,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;

	seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
	if (!seg_name)
		return -ENOMEM;

	/* translate the image-relative range to object name + offset */
	seg_len = rbd_get_segment(&rbd_dev->header,
				  rbd_dev->header.object_prefix,
				  ofs, len,
				  seg_name, &seg_ofs);

	/* only writes carry data in the op payload */
	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	BUG_ON(seg_len < len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1152
1153/*
1154 * Request async osd write
1155 */
1156static int rbd_req_write(struct request *rq,
1157 struct rbd_device *rbd_dev,
1158 struct ceph_snap_context *snapc,
1159 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001160 struct bio *bio,
1161 struct rbd_req_coll *coll,
1162 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001163{
1164 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1165 CEPH_OSD_OP_WRITE,
1166 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001167 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001168}
1169
1170/*
1171 * Request async osd read
1172 */
1173static int rbd_req_read(struct request *rq,
1174 struct rbd_device *rbd_dev,
1175 u64 snapid,
1176 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001177 struct bio *bio,
1178 struct rbd_req_coll *coll,
1179 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001180{
1181 return rbd_do_op(rq, rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001182 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001183 CEPH_OSD_OP_READ,
1184 CEPH_OSD_FLAG_READ,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001185 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001186}
1187
1188/*
1189 * Request sync osd read
1190 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001191static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001192 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001193 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001194 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001195 char *buf,
1196 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001197{
Alex Elder913d2fd2012-06-26 12:57:03 -07001198 struct ceph_osd_req_op *ops;
1199 int ret;
1200
1201 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1202 if (!ops)
1203 return -ENOMEM;
1204
1205 ret = rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001206 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001207 CEPH_OSD_FLAG_READ,
Alex Elder913d2fd2012-06-26 12:57:03 -07001208 ops, object_name, ofs, len, buf, NULL, ver);
1209 rbd_destroy_ops(ops);
1210
1211 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001212}
1213
/*
 * Acknowledge a watch notification on the header object.  The request
 * is fire-and-forget: rbd_simple_req_cb just drops the request ref.
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = cpu_to_le64(ver);
	/* NOTE(review): cookie not byte-swapped here, unlike .ver — confirm */
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  rbd_dev->header_name, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  ops,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1243
/*
 * Watch callback for the header object: refresh the cached header and
 * acknowledge the notification (even if the refresh failed, so the
 * notifier is not left waiting).
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	u64 hver;
	int rc;

	if (!rbd_dev)
		return;

	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
	     rbd_dev->header_name, (unsigned long long) notify_id,
	     (unsigned int) opcode);
	rc = rbd_refresh_header(rbd_dev, &hver);
	if (rc)
		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
			   " update snaps: %d\n", rbd_dev->major, rc);

	/*
	 * NOTE(review): if the refresh failed, hver may be uninitialized
	 * here — confirm rbd_refresh_header always sets it.
	 */
	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
}
1263
/*
 * Request sync osd watch
 *
 * Register a lingering watch on the header object so rbd_watch_cb is
 * invoked when the header changes.  The osd event and lingering
 * request handles are stored in rbd_dev->watch_event/watch_request.
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	/* flag=1 registers the watch (0 removes it, see sync_unwatch) */
	ops[0].watch.flag = 1;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1307
/*
 * Request sync osd unwatch
 *
 * Remove the watch registered by rbd_req_sync_watch() and cancel the
 * associated osd event (unconditionally, even if the unwatch request
 * itself failed).
 */
static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 0;	/* 0 == unregister */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL, NULL, NULL);


	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	return ret;
}
1337
/* Context handed to the osd event machinery for self-sent notifies. */
struct rbd_notify_info {
	struct rbd_device *rbd_dev;
};
1341
1342static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1343{
Alex Elder0ce1a792012-07-03 16:01:18 -05001344 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1345 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001346 return;
1347
Alex Elderbd919d42012-07-13 20:35:11 -05001348 dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n",
1349 rbd_dev->header_name, (unsigned long long) notify_id,
1350 (unsigned int) opcode);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001351}
1352
/*
 * Request sync osd notify
 *
 * Send a NOTIFY on the header object to wake other watchers, then wait
 * (bounded by CEPH_OSD_TIMEOUT_DEFAULT) for the notify round-trip.
 */
static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_event *event;
	struct rbd_notify_info info;
	/* notify payload: version + timeout, two 32-bit words */
	int payload_len = sizeof(u32) + sizeof(u32);
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len);
	if (!ops)
		return -ENOMEM;

	info.rbd_dev = rbd_dev;

	/*
	 * NOTE(review): rbd_notify_cb casts its data pointer straight to
	 * struct rbd_device *, but we pass &info (struct rbd_notify_info *).
	 * That only lines up because rbd_dev is info's first member, and
	 * even then the callback would get &info, not info.rbd_dev —
	 * confirm intent.
	 */
	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
				     (void *)&info, &event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = 1;
	ops[0].watch.flag = 1;
	ops[0].watch.cookie = event->cookie;
	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
	ops[0].watch.timeout = 12;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL, NULL, NULL);
	if (ret < 0)
		goto fail_event;

	/* wait result is only logged; success is reported regardless */
	ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
	dout("ceph_osdc_wait_event returned %d\n", ret);
	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(event);
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1402
/*
 * Request sync osd class-method call (CEPH_OSD_OP_CALL)
 *
 * Synchronously invoke @class_name.@method_name on @object_name with
 * @len bytes of input @data; optionally returns the object version.
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *data,
			     int len,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int class_name_len = strlen(class_name);
	int method_name_len = strlen(method_name);
	int ret;

	/* payload carries class name + method name + input data */
	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL,
				class_name_len + method_name_len + len);
	if (!ops)
		return -ENOMEM;

	ops[0].cls.class_name = class_name;
	ops[0].cls.class_len = (__u8) class_name_len;
	ops[0].cls.method_name = method_name;
	ops[0].cls.method_len = (__u8) method_name_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = data;
	ops[0].cls.indata_len = len;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      object_name, 0, 0, NULL, NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1443
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001444static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1445{
1446 struct rbd_req_coll *coll =
1447 kzalloc(sizeof(struct rbd_req_coll) +
1448 sizeof(struct rbd_req_status) * num_reqs,
1449 GFP_ATOMIC);
1450
1451 if (!coll)
1452 return NULL;
1453 coll->total = num_reqs;
1454 kref_init(&coll->kref);
1455 return coll;
1456}
1457
/*
 * block device queue callback
 *
 * Drains the request queue: each FS request is split along object
 * segment boundaries and submitted as one async OSD op per segment,
 * tracked by a shared rbd_req_coll.  Entered with q->queue_lock held;
 * the lock is dropped while cloning/submitting and re-taken before
 * fetching the next request.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;
	struct bio_pair *bp = NULL;

	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		struct bio *rq_bio, *next_bio = NULL;
		bool do_write;
		unsigned int size;
		u64 op_size = 0;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;

		/* peek at request from block layer */
		if (!rq)
			break;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		rq_bio = rq->bio;
		if (do_write && rbd_dev->read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		/* drop the queue lock for the (possibly slow) submission */
		spin_unlock_irq(q->queue_lock);

		down_read(&rbd_dev->header_rwsem);

		/* mapped snapshot may have been deleted underneath us */
		if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) {
			up_read(&rbd_dev->header_rwsem);
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		/* pin the snapshot context for the lifetime of the writes */
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		do {
			/* a bio clone to be passed down to OSD req */
			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
			/* bytes of this request within the current segment */
			op_size = rbd_get_segment(&rbd_dev->header,
						  rbd_dev->header.object_prefix,
						  ofs, size,
						  NULL, NULL);
			/* one coll reference per in-flight segment */
			kref_get(&coll->kref);
			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
					      op_size, GFP_ATOMIC);
			if (!bio) {
				/* fail just this segment; keep going */
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, op_size);
				goto next_seg;
			}


			/* init OSD command: write or read */
			if (do_write)
				rbd_req_write(rq, rbd_dev,
					      snapc,
					      ofs,
					      op_size, bio,
					      coll, cur_seg);
			else
				rbd_req_read(rq, rbd_dev,
					     rbd_dev->snap_id,
					     ofs,
					     op_size, bio,
					     coll, cur_seg);

next_seg:
			size -= op_size;
			ofs += op_size;

			cur_seg++;
			rq_bio = next_bio;
		} while (size > 0);
		/* drop the submission-loop's own reference */
		kref_put(&coll->kref, rbd_coll_release);

		if (bp)
			bio_pair_release(bp);
		/* re-take the queue lock before the next blk_fetch_request */
		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1577
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone
 *
 * Returns the number of bytes that may still be added to the bio at
 * its current position without crossing an object boundary (possibly
 * clamped to 0), or bv_len for an empty bio so single-page bios are
 * always accepted.
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
                          struct bio_vec *bvec)
{
        struct rbd_device *rbd_dev = q->queuedata;
        unsigned int chunk_sectors;     /* object ("chunk") size in sectors */
        sector_t sector;                /* absolute starting sector of the bio */
        unsigned int bio_sectors;       /* sectors already in the bio */
        int max;

        /* Each backing object covers 2^obj_order bytes */
        chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
        sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
        bio_sectors = bmd->bi_size >> SECTOR_SHIFT;

        /*
         * Bytes left in the current object past the bio's end; the mask
         * works because chunk_sectors is a power of two.
         */
        max = (chunk_sectors - ((sector & (chunk_sectors - 1))
                                + bio_sectors)) << SECTOR_SHIFT;
        if (max < 0)
                max = 0; /* bio_add cannot handle a negative return */
        if (max <= bvec->bv_len && bio_sectors == 0)
                return bvec->bv_len;
        return max;
}
1604
/*
 * Tear down the gendisk and request queue of a mapped rbd device and
 * release its in-memory image header.  Safe to call when no disk was
 * ever allocated.
 */
static void rbd_free_disk(struct rbd_device *rbd_dev)
{
        struct gendisk *disk = rbd_dev->disk;

        if (!disk)
                return;

        rbd_header_free(&rbd_dev->header);

        /* Only unregister the disk if it was actually added to the system */
        if (disk->flags & GENHD_FL_UP)
                del_gendisk(disk);
        if (disk->queue)
                blk_cleanup_queue(disk->queue);
        put_disk(disk);
}
1620
/*
 * Read the complete header for the given rbd device.
 *
 * Returns a pointer to a dynamically-allocated buffer containing
 * the complete and validated header.  Caller can pass the address
 * of a variable that will be filled in with the version of the
 * header object at the time it was read.
 *
 * Returns a pointer-coded errno if a failure occurs.  On success the
 * caller owns the returned buffer and must kfree() it.
 */
static struct rbd_image_header_ondisk *
rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
{
        struct rbd_image_header_ondisk *ondisk = NULL;
        u32 snap_count = 0;     /* snapshot count assumed for sizing; 0 first pass */
        u64 names_size = 0;     /* total bytes of snapshot names; 0 first pass */
        u32 want_count;
        int ret;

        /*
         * The complete header will include an array of its 64-bit
         * snapshot ids, followed by the names of those snapshots as
         * a contiguous block of NUL-terminated strings.  Note that
         * the number of snapshots could change by the time we read
         * it in, in which case we re-read it.
         */
        do {
                size_t size;

                /* Free the buffer from the previous (too small) attempt */
                kfree(ondisk);

                size = sizeof (*ondisk);
                size += snap_count * sizeof (struct rbd_image_snap_ondisk);
                size += names_size;
                ondisk = kmalloc(size, GFP_KERNEL);
                if (!ondisk)
                        return ERR_PTR(-ENOMEM);

                ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
                                        rbd_dev->header_name,
                                        0, size,
                                        (char *) ondisk, version);

                if (ret < 0)
                        goto out_err;
                /* A short read means the object is smaller than expected */
                if (WARN_ON((size_t) ret < size)) {
                        ret = -ENXIO;
                        pr_warning("short header read for image %s"
                                   " (want %zd got %d)\n",
                                   rbd_dev->image_name, size, ret);
                        goto out_err;
                }
                if (!rbd_dev_ondisk_valid(ondisk)) {
                        ret = -ENXIO;
                        pr_warning("invalid header for image %s\n",
                                   rbd_dev->image_name);
                        goto out_err;
                }

                /* Retry with the actual counts if they changed under us */
                names_size = le64_to_cpu(ondisk->snap_names_len);
                want_count = snap_count;
                snap_count = le32_to_cpu(ondisk->snap_count);
        } while (snap_count != want_count);

        return ondisk;

out_err:
        kfree(ondisk);

        return ERR_PTR(ret);
}
1692
/*
 * reload the ondisk header: read it via rbd_dev_v1_header_read(),
 * decode it into *header, and record the header object version.
 *
 * Returns 0 on success or a negative errno.
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
                           struct rbd_image_header *header)
{
        struct rbd_image_header_ondisk *ondisk;
        u64 ver = 0;
        int ret;

        ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
        if (IS_ERR(ondisk))
                return PTR_ERR(ondisk);
        ret = rbd_header_from_disk(header, ondisk);
        if (ret >= 0)
                header->obj_version = ver;
        /* The raw on-disk buffer is no longer needed once decoded */
        kfree(ondisk);

        return ret;
}
1713
1714/*
1715 * create a snapshot
1716 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001717static int rbd_header_add_snap(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001718 const char *snap_name,
1719 gfp_t gfp_flags)
1720{
1721 int name_len = strlen(snap_name);
1722 u64 new_snapid;
1723 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001724 void *data, *p, *e;
Alex Elder1dbb4392012-01-24 10:08:37 -06001725 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001726
1727 /* we should create a snapshot only if we're pointing at the head */
Alex Elder0ce1a792012-07-03 16:01:18 -05001728 if (rbd_dev->snap_id != CEPH_NOSNAP)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001729 return -EINVAL;
1730
Alex Elder0ce1a792012-07-03 16:01:18 -05001731 monc = &rbd_dev->rbd_client->client->monc;
1732 ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
Alex Elderbd919d42012-07-13 20:35:11 -05001733 dout("created snapid=%llu\n", (unsigned long long) new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001734 if (ret < 0)
1735 return ret;
1736
1737 data = kmalloc(name_len + 16, gfp_flags);
1738 if (!data)
1739 return -ENOMEM;
1740
Sage Weil916d4d62011-05-12 16:10:50 -07001741 p = data;
1742 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001743
Sage Weil916d4d62011-05-12 16:10:50 -07001744 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1745 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001746
Alex Elder0bed54d2012-07-03 16:01:18 -05001747 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
Alex Elder0ce1a792012-07-03 16:01:18 -05001748 "rbd", "snap_add",
Alex Elderd67d4be2012-07-13 20:35:11 -05001749 data, p - data, NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001750
Sage Weil916d4d62011-05-12 16:10:50 -07001751 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001752
Alex Elder505cbb92012-07-19 08:49:18 -05001753 return ret < 0 ? ret : 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001754bad:
1755 return -ERANGE;
1756}
1757
/*
 * Unregister and free every snapshot device attached to rbd_dev.
 * Uses the _safe iterator because each entry is removed from the
 * list as it is visited.
 */
static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
{
        struct rbd_snap *snap;
        struct rbd_snap *next;

        list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
                __rbd_remove_snap_dev(snap);
}
1766
/*
 * only read the first part of the ondisk header, without the snaps info
 *
 * Re-reads the image header and swaps the freshly-read fields into
 * rbd_dev->header under header_rwsem.  If hver is non-NULL it receives
 * the new header object version.  Caller must hold ctl_mutex (see
 * rbd_refresh_header()).
 */
static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{
        int ret;
        struct rbd_image_header h;

        /* Read outside the rwsem; only the swap below needs protection */
        ret = rbd_read_header(rbd_dev, &h);
        if (ret < 0)
                return ret;

        down_write(&rbd_dev->header_rwsem);

        /* resized? */
        if (rbd_dev->snap_id == CEPH_NOSNAP) {
                sector_t size = (sector_t) h.image_size / SECTOR_SIZE;

                dout("setting size to %llu sectors", (unsigned long long) size);
                set_capacity(rbd_dev->disk, size);
        }

        /* rbd_dev->header.object_prefix shouldn't change */
        kfree(rbd_dev->header.snap_sizes);
        kfree(rbd_dev->header.snap_names);
        /* osd requests may still refer to snapc */
        ceph_put_snap_context(rbd_dev->header.snapc);

        if (hver)
                *hver = h.obj_version;
        rbd_dev->header.obj_version = h.obj_version;
        rbd_dev->header.image_size = h.image_size;
        rbd_dev->header.total_snaps = h.total_snaps;
        rbd_dev->header.snapc = h.snapc;
        rbd_dev->header.snap_names = h.snap_names;
        rbd_dev->header.snap_sizes = h.snap_sizes;
        /* Free the extra copy of the object prefix */
        WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
        kfree(h.object_prefix);

        /* Re-sync the snapshot device list with the new snap context */
        ret = __rbd_init_snaps_header(rbd_dev);

        up_write(&rbd_dev->header_rwsem);

        return ret;
}
1813
/*
 * Locked wrapper around __rbd_refresh_header(): serializes header
 * refreshes against other control operations via ctl_mutex.
 */
static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{
        int ret;

        mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
        ret = __rbd_refresh_header(rbd_dev, hver);
        mutex_unlock(&ctl_mutex);

        return ret;
}
1824
/*
 * Set up the gendisk and request queue for a newly-mapped rbd device:
 * read the image header, build the snapshot list, select the mapped
 * snapshot, then allocate and configure the block device and announce
 * it with add_disk().
 *
 * Returns 0 on success or a negative errno.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
        struct gendisk *disk;
        struct request_queue *q;
        int rc;
        u64 segment_size;
        u64 total_size = 0;

        /* contact OSD, request size info about the object being mapped */
        rc = rbd_read_header(rbd_dev, &rbd_dev->header);
        if (rc)
                return rc;

        /* no need to lock here, as rbd_dev is not registered yet */
        rc = __rbd_init_snaps_header(rbd_dev);
        if (rc)
                return rc;

        /* Pick the mapped snapshot (or head) and learn its size */
        rc = rbd_header_set_snap(rbd_dev, &total_size);
        if (rc)
                return rc;

        /* create gendisk info */
        rc = -ENOMEM;
        disk = alloc_disk(RBD_MINORS_PER_MAJOR);
        if (!disk)
                goto out;

        snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
                 rbd_dev->dev_id);
        disk->major = rbd_dev->major;
        disk->first_minor = 0;
        disk->fops = &rbd_bd_ops;
        disk->private_data = rbd_dev;

        /* init rq */
        rc = -ENOMEM;
        q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
        if (!q)
                goto out_disk;

        /* We use the default size, but let's be explicit about it. */
        blk_queue_physical_block_size(q, SECTOR_SIZE);

        /* set io sizes to object size */
        segment_size = rbd_obj_bytes(&rbd_dev->header);
        blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
        blk_queue_max_segment_size(q, segment_size);
        blk_queue_io_min(q, segment_size);
        blk_queue_io_opt(q, segment_size);

        /* Keep bios from spanning object boundaries */
        blk_queue_merge_bvec(q, rbd_merge_bvec);
        disk->queue = q;

        q->queuedata = rbd_dev;

        rbd_dev->disk = disk;
        rbd_dev->q = q;

        /* finally, announce the disk to the world */
        set_capacity(disk, total_size / SECTOR_SIZE);
        add_disk(disk);

        pr_info("%s: added with size 0x%llx\n",
                disk->disk_name, (unsigned long long)total_size);
        return 0;

out_disk:
        put_disk(disk);
out:
        return rc;
}
1897
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001898/*
1899 sysfs
1900*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001901
/* Map an embedded struct device back to its containing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
        return container_of(dev, struct rbd_device, dev);
}
1906
/*
 * sysfs "size" attribute: the mapped image size in bytes.  Reads the
 * disk capacity under header_rwsem so a concurrent refresh/resize
 * can't be observed half-way.
 */
static ssize_t rbd_size_show(struct device *dev,
                             struct device_attribute *attr, char *buf)
{
        struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
        sector_t size;

        down_read(&rbd_dev->header_rwsem);
        size = get_capacity(rbd_dev->disk);
        up_read(&rbd_dev->header_rwsem);

        return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
}
1919
/* sysfs "major" attribute: the device's block major number. */
static ssize_t rbd_major_show(struct device *dev,
                              struct device_attribute *attr, char *buf)
{
        struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

        return sprintf(buf, "%d\n", rbd_dev->major);
}

/* sysfs "client_id" attribute: the ceph client instance id. */
static ssize_t rbd_client_id_show(struct device *dev,
                                  struct device_attribute *attr, char *buf)
{
        struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

        return sprintf(buf, "client%lld\n",
                       ceph_client_id(rbd_dev->rbd_client->client));
}

/* sysfs "pool" attribute: name of the rados pool holding the image. */
static ssize_t rbd_pool_show(struct device *dev,
                             struct device_attribute *attr, char *buf)
{
        struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

        return sprintf(buf, "%s\n", rbd_dev->pool_name);
}
1944
/* sysfs "pool_id" attribute: numeric id of the image's rados pool. */
static ssize_t rbd_pool_id_show(struct device *dev,
                                struct device_attribute *attr, char *buf)
{
        struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

        return sprintf(buf, "%d\n", rbd_dev->pool_id);
}

/* sysfs "name" attribute: the rbd image name. */
static ssize_t rbd_name_show(struct device *dev,
                             struct device_attribute *attr, char *buf)
{
        struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

        return sprintf(buf, "%s\n", rbd_dev->image_name);
}

/* sysfs "current_snap" attribute: name of the mapped snapshot (or head). */
static ssize_t rbd_snap_show(struct device *dev,
                             struct device_attribute *attr,
                             char *buf)
{
        struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

        return sprintf(buf, "%s\n", rbd_dev->snap_name);
}
1969
/*
 * sysfs "refresh" attribute (write-only): re-read the image header
 * from the OSDs.  Returns the full write size on success so the
 * write is not retried, or a negative errno.
 */
static ssize_t rbd_image_refresh(struct device *dev,
                                 struct device_attribute *attr,
                                 const char *buf,
                                 size_t size)
{
        struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
        int ret;

        ret = rbd_refresh_header(rbd_dev, NULL);

        return ret < 0 ? ret : size;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001982
/* Per-device sysfs attributes (see Documentation/ABI/testing/sysfs-bus-rbd) */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);

static struct attribute *rbd_attrs[] = {
        &dev_attr_size.attr,
        &dev_attr_major.attr,
        &dev_attr_client_id.attr,
        &dev_attr_pool.attr,
        &dev_attr_pool_id.attr,
        &dev_attr_name.attr,
        &dev_attr_current_snap.attr,
        &dev_attr_refresh.attr,
        &dev_attr_create_snap.attr,
        NULL
};

static struct attribute_group rbd_attr_group = {
        .attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
        &rbd_attr_group,
        NULL
};

/* No-op release; rbd_device lifetime is managed via rbd_dev_release */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
        .name = "rbd",
        .groups = rbd_attr_groups,
        .release = rbd_sysfs_dev_release,
};
2024
2025
2026/*
2027 sysfs - snapshots
2028*/
2029
/* sysfs "snap_size" attribute: the snapshot's image size in bytes. */
static ssize_t rbd_snap_size_show(struct device *dev,
                                  struct device_attribute *attr,
                                  char *buf)
{
        struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

        return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
}

/* sysfs "snap_id" attribute: the snapshot's 64-bit ceph snap id. */
static ssize_t rbd_snap_id_show(struct device *dev,
                                struct device_attribute *attr,
                                char *buf)
{
        struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

        return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
}
2047
/* Per-snapshot sysfs attributes */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
        &dev_attr_snap_size.attr,
        &dev_attr_snap_id.attr,
        NULL,
};

static struct attribute_group rbd_snap_attr_group = {
        .attrs = rbd_snap_attrs,
};

/* Final release for a snapshot device: frees the rbd_snap itself */
static void rbd_snap_dev_release(struct device *dev)
{
        struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
        kfree(snap->name);
        kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
        &rbd_snap_attr_group,
        NULL
};

static struct device_type rbd_snap_device_type = {
        .groups = rbd_snap_attr_groups,
        .release = rbd_snap_dev_release,
};
2077
/*
 * Remove a snapshot from its device's list and unregister it.
 * The rbd_snap itself is freed by rbd_snap_dev_release() once the
 * device reference count drops.
 */
static void __rbd_remove_snap_dev(struct rbd_snap *snap)
{
        list_del(&snap->node);
        device_unregister(&snap->dev);
}
2083
/*
 * Register a snapshot's struct device under the given parent (the
 * rbd device), named "snap_<name>" in sysfs.  Returns the
 * device_register() result.
 */
static int rbd_register_snap_dev(struct rbd_snap *snap,
                                 struct device *parent)
{
        struct device *dev = &snap->dev;
        int ret;

        dev->type = &rbd_snap_device_type;
        dev->parent = parent;
        /* dev->release duplicates the type's release; both point here */
        dev->release = rbd_snap_dev_release;
        dev_set_name(dev, "snap_%s", snap->name);
        ret = device_register(dev);

        return ret;
}
2098
/*
 * Allocate an rbd_snap for the i'th entry of the device's snapshot
 * context, copying its name, size and id from the current header.
 * If the rbd device is already visible in sysfs the snapshot device
 * is registered immediately; otherwise registration happens later in
 * rbd_bus_add_dev().
 *
 * Returns the new rbd_snap or a pointer-coded errno.  The caller is
 * responsible for linking it into rbd_dev->snaps.
 */
static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
                                           int i, const char *name)
{
        struct rbd_snap *snap;
        int ret;

        snap = kzalloc(sizeof (*snap), GFP_KERNEL);
        if (!snap)
                return ERR_PTR(-ENOMEM);

        ret = -ENOMEM;
        snap->name = kstrdup(name, GFP_KERNEL);
        if (!snap->name)
                goto err;

        snap->size = rbd_dev->header.snap_sizes[i];
        snap->id = rbd_dev->header.snapc->snaps[i];
        if (device_is_registered(&rbd_dev->dev)) {
                ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
                if (ret < 0)
                        goto err;
        }

        return snap;

err:
        kfree(snap->name);
        kfree(snap);

        return ERR_PTR(ret);
}
2130
/*
 * Scan the rbd device's current snapshot list and compare it to the
 * newly-received snapshot context.  Remove any existing snapshots
 * not present in the new snapshot context.  Add a new snapshot for
 * any snaphots in the snapshot context not in the current list.
 * And verify there are no changes to snapshots we already know
 * about.
 *
 * Assumes the snapshots in the snapshot context are sorted by
 * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
 * are also maintained in that order.)
 *
 * This is a classic merge of two sorted sequences: "index" walks the
 * snap context, "links" walks the existing list; each step consumes
 * from one side or both.
 */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
{
        struct ceph_snap_context *snapc = rbd_dev->header.snapc;
        const u32 snap_count = snapc->num_snaps;
        char *snap_name = rbd_dev->header.snap_names;
        struct list_head *head = &rbd_dev->snaps;
        struct list_head *links = head->next;
        u32 index = 0;

        while (index < snap_count || links != head) {
                u64 snap_id;
                struct rbd_snap *snap;

                /* CEPH_NOSNAP marks "snap context exhausted" */
                snap_id = index < snap_count ? snapc->snaps[index]
                                             : CEPH_NOSNAP;
                snap = links != head ? list_entry(links, struct rbd_snap, node)
                                     : NULL;
                BUG_ON(snap && snap->id == CEPH_NOSNAP);

                if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
                        struct list_head *next = links->next;

                        /* Existing snapshot not in the new snap context */

                        /* The mapped snapshot vanished: mark it gone */
                        if (rbd_dev->snap_id == snap->id)
                                rbd_dev->snap_exists = false;
                        __rbd_remove_snap_dev(snap);

                        /* Done with this list entry; advance */

                        links = next;
                        continue;
                }

                if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
                        struct rbd_snap *new_snap;

                        /* We haven't seen this snapshot before */

                        new_snap = __rbd_add_snap_dev(rbd_dev, index,
                                                      snap_name);
                        if (IS_ERR(new_snap))
                                return PTR_ERR(new_snap);

                        /* New goes before existing, or at end of list */

                        if (snap)
                                list_add_tail(&new_snap->node, &snap->node);
                        else
                                list_add_tail(&new_snap->node, head);
                } else {
                        /* Already have this one */

                        BUG_ON(snap->size != rbd_dev->header.snap_sizes[index]);
                        BUG_ON(strcmp(snap->name, snap_name));

                        /* Done with this list entry; advance */

                        links = links->next;
                }

                /* Advance to the next entry in the snapshot context */

                index++;
                snap_name += strlen(snap_name) + 1;
        }

        return 0;
}
2212
/*
 * Register the rbd device on the rbd bus in sysfs, then register a
 * device for each snapshot already on its list.  Serialized by
 * ctl_mutex.  Note that a snapshot registration failure stops the
 * loop but leaves earlier registrations in place; the caller is
 * expected to tear down on error.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
        int ret;
        struct device *dev;
        struct rbd_snap *snap;

        mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
        dev = &rbd_dev->dev;

        dev->bus = &rbd_bus_type;
        dev->type = &rbd_device_type;
        dev->parent = &rbd_root_dev;
        dev->release = rbd_dev_release;
        dev_set_name(dev, "%d", rbd_dev->dev_id);
        ret = device_register(dev);
        if (ret < 0)
                goto out;

        list_for_each_entry(snap, &rbd_dev->snaps, node) {
                ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
                if (ret < 0)
                        break;
        }
out:
        mutex_unlock(&ctl_mutex);
        return ret;
}
2240
/* Unregister the rbd device from sysfs (reverse of rbd_bus_add_dev). */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
        device_unregister(&rbd_dev->dev);
}
2245
/*
 * Establish a watch on the image header object.  -ERANGE from the
 * OSD means our cached header version is stale, so refresh the
 * header and retry until the watch either succeeds or fails for a
 * different reason.
 */
static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
{
        int ret, rc;

        do {
                ret = rbd_req_sync_watch(rbd_dev);
                if (ret == -ERANGE) {
                        rc = rbd_refresh_header(rbd_dev, NULL);
                        if (rc < 0)
                                return rc;
                }
        } while (ret == -ERANGE);

        return ret;
}
2261
/* Highest device id handed out so far (ids start at 1) */
static atomic64_t rbd_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_id_get(struct rbd_device *rbd_dev)
{
        rbd_dev->dev_id = atomic64_inc_return(&rbd_id_max);

        spin_lock(&rbd_dev_list_lock);
        list_add_tail(&rbd_dev->node, &rbd_dev_list);
        spin_unlock(&rbd_dev_list_lock);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002276
Alex Elder1ddbe942012-01-29 13:57:44 -06002277/*
Alex Elder499afd52012-02-02 08:13:29 -06002278 * Remove an rbd_dev from the global list, and record that its
2279 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002280 */
Alex Elder499afd52012-02-02 08:13:29 -06002281static void rbd_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002282{
Alex Elderd184f6b2012-01-29 13:57:44 -06002283 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05002284 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002285 int max_id;
2286
2287 BUG_ON(rbd_id < 1);
Alex Elder499afd52012-02-02 08:13:29 -06002288
2289 spin_lock(&rbd_dev_list_lock);
2290 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002291
2292 /*
2293 * If the id being "put" is not the current maximum, there
2294 * is nothing special we need to do.
2295 */
2296 if (rbd_id != atomic64_read(&rbd_id_max)) {
2297 spin_unlock(&rbd_dev_list_lock);
2298 return;
2299 }
2300
2301 /*
2302 * We need to update the current maximum id. Search the
2303 * list to find out what it is. We're more likely to find
2304 * the maximum at the end, so search the list backward.
2305 */
2306 max_id = 0;
2307 list_for_each_prev(tmp, &rbd_dev_list) {
2308 struct rbd_device *rbd_dev;
2309
2310 rbd_dev = list_entry(tmp, struct rbd_device, node);
2311 if (rbd_id > max_id)
2312 max_id = rbd_id;
2313 }
Alex Elder499afd52012-02-02 08:13:29 -06002314 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002315
Alex Elder1ddbe942012-01-29 13:57:44 -06002316 /*
Alex Elderd184f6b2012-01-29 13:57:44 -06002317 * The max id could have been updated by rbd_id_get(), in
2318 * which case it now accurately reflects the new maximum.
2319 * Be careful not to overwrite the maximum value in that
2320 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002321 */
Alex Elderd184f6b2012-01-29 13:57:44 -06002322 atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
Alex Elderb7f23c32012-01-29 13:57:43 -06002323}
2324
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
        /* Characters that produce nonzero isspace() in "C"/"POSIX" locales */
        static const char delims[] = " \f\n\r\t\v";
        const char *start;

        start = *buf + strspn(*buf, delims);    /* skip leading white space */
        *buf = start;

        return strcspn(start, delims);          /* length of the token */
}
2343
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
                                char *token,
                                size_t token_size)
{
        size_t len = next_token(buf);

        /* Copy only when the token (plus its terminator) fits */
        if (len < token_size) {
                memcpy(token, *buf, len);
                token[len] = '\0';
        }
        *buf += len;            /* consume the token regardless */

        return len;
}
2373
/*
 * Finds the next token in *buf, dynamically allocates a buffer big
 * enough to hold a copy of it, and copies the token into the new
 * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
 * that a duplicate buffer is created even for a zero-length token.
 *
 * Returns a pointer to the newly-allocated duplicate, or a null
 * pointer if memory for the duplicate was not available.  If
 * the lenp argument is a non-null pointer, the length of the token
 * (not including the '\0') is returned in *lenp.
 *
 * If successful, the *buf pointer will be updated to point beyond
 * the end of the found token.
 *
 * Note: uses GFP_KERNEL for allocation.  Caller owns and must
 * kfree() the returned buffer.
 */
static inline char *dup_token(const char **buf, size_t *lenp)
{
        char *dup;
        size_t len;

        len = next_token(buf);
        dup = kmalloc(len + 1, GFP_KERNEL);
        if (!dup)
                return NULL;

        memcpy(dup, *buf, len);
        *(dup + len) = '\0';
        *buf += len;

        if (lenp)
                *lenp = len;

        return dup;
}
2409
/*
 * This fills in the pool_name, image_name, image_name_len, snap_name,
 * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
 * on the list of monitor addresses and other options provided via
 * /sys/bus/rbd/add.  The monitor addresses are returned by pointer
 * into the caller's buffer (not copied); the remaining names are
 * dynamically allocated and owned by rbd_dev.
 *
 * Returns 0 on success; -EINVAL for a malformed buffer, -ENOMEM on
 * allocation failure (in which case any partially-filled name
 * fields are freed and reset).
 *
 * Note: rbd_dev is assumed to have been initially zero-filled.
 */
static int rbd_add_parse_args(struct rbd_device *rbd_dev,
			      const char *buf,
			      const char **mon_addrs,
			      size_t *mon_addrs_size,
			      char *options,
			      size_t options_size)
{
	size_t len;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len)
		return -EINVAL;
	/* Size reported to the caller includes room for a trailing '\0' */
	*mon_addrs_size = len + 1;
	*mon_addrs = buf;

	buf += len;

	/* Options must be present and must fit in the supplied buffer */
	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return -EINVAL;

	/* Any failure below is an allocation failure */
	ret = -ENOMEM;
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Create the name of the header object */

	rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
						+ sizeof (RBD_SUFFIX),
					GFP_KERNEL);
	if (!rbd_dev->header_name)
		goto out_err;
	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);

	/*
	 * The snapshot name is optional.  If none is supplied,
	 * we use the default value.
	 */
	rbd_dev->snap_name = dup_token(&buf, &len);
	if (!rbd_dev->snap_name)
		goto out_err;
	if (!len) {
		/* Replace the empty name with the default */
		kfree(rbd_dev->snap_name);
		rbd_dev->snap_name
			= kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
		if (!rbd_dev->snap_name)
			goto out_err;

		memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
			sizeof (RBD_SNAP_HEAD_NAME));
	}

	return 0;

out_err:
	/* Free everything allocated so far and reset the fields, so
	 * later cleanup code can't double-free */
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->image_name);
	rbd_dev->image_name = NULL;
	rbd_dev->image_name_len = 0;
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return ret;
}
2492
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002493static ssize_t rbd_add(struct bus_type *bus,
2494 const char *buf,
2495 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002496{
Alex Eldercb8627c2012-07-09 21:04:23 -05002497 char *options;
2498 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06002499 const char *mon_addrs = NULL;
2500 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06002501 struct ceph_osd_client *osdc;
2502 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002503
2504 if (!try_module_get(THIS_MODULE))
2505 return -ENODEV;
2506
Alex Elder27cc2592012-02-02 08:13:30 -06002507 options = kmalloc(count, GFP_KERNEL);
2508 if (!options)
2509 goto err_nomem;
Alex Eldercb8627c2012-07-09 21:04:23 -05002510 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2511 if (!rbd_dev)
2512 goto err_nomem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002513
2514 /* static rbd_device initialization */
2515 spin_lock_init(&rbd_dev->lock);
2516 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002517 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08002518 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002519
Alex Elderd184f6b2012-01-29 13:57:44 -06002520 /* generate unique id: find highest unique id, add one */
Alex Elder499afd52012-02-02 08:13:29 -06002521 rbd_id_get(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002522
Alex Eldera725f65e2012-02-02 08:13:30 -06002523 /* Fill in the device name, now that we have its id. */
Alex Elder81a89792012-02-02 08:13:30 -06002524 BUILD_BUG_ON(DEV_NAME_LEN
2525 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
Alex Elderde71a292012-07-03 16:01:19 -05002526 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
Alex Eldere124a82f2012-01-29 13:57:44 -06002527
Alex Eldera725f65e2012-02-02 08:13:30 -06002528 /* parse add command */
Alex Elder7ef32142012-02-02 08:13:30 -06002529 rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
Alex Eldere28fff262012-02-02 08:13:30 -06002530 options, count);
Alex Eldera725f65e2012-02-02 08:13:30 -06002531 if (rc)
2532 goto err_put_id;
2533
Alex Elderf8c38922012-08-10 13:12:07 -07002534 rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
2535 if (rc < 0)
Alex Elderf0f8cef2012-01-29 13:57:44 -06002536 goto err_put_id;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002537
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002538 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06002539 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002540 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2541 if (rc < 0)
2542 goto err_out_client;
Alex Elder9bb2f332012-07-12 10:46:35 -05002543 rbd_dev->pool_id = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002544
2545 /* register our block device */
Alex Elder27cc2592012-02-02 08:13:30 -06002546 rc = register_blkdev(0, rbd_dev->name);
2547 if (rc < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002548 goto err_out_client;
Alex Elder27cc2592012-02-02 08:13:30 -06002549 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002550
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002551 rc = rbd_bus_add_dev(rbd_dev);
2552 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002553 goto err_out_blkdev;
2554
Alex Elder32eec682012-02-08 16:11:14 -06002555 /*
2556 * At this point cleanup in the event of an error is the job
2557 * of the sysfs code (initiated by rbd_bus_del_dev()).
2558 *
2559 * Set up and announce blkdev mapping.
2560 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002561 rc = rbd_init_disk(rbd_dev);
2562 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002563 goto err_out_bus;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002564
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002565 rc = rbd_init_watch_dev(rbd_dev);
2566 if (rc)
2567 goto err_out_bus;
2568
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002569 return count;
2570
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002571err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002572 /* this will also clean up rest of rbd_dev stuff */
2573
2574 rbd_bus_del_dev(rbd_dev);
2575 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002576 return rc;
2577
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002578err_out_blkdev:
2579 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2580err_out_client:
2581 rbd_put_client(rbd_dev);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002582err_put_id:
Alex Eldercb8627c2012-07-09 21:04:23 -05002583 if (rbd_dev->pool_name) {
Alex Elder820a5f32012-07-09 21:04:24 -05002584 kfree(rbd_dev->snap_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002585 kfree(rbd_dev->header_name);
2586 kfree(rbd_dev->image_name);
Alex Eldercb8627c2012-07-09 21:04:23 -05002587 kfree(rbd_dev->pool_name);
2588 }
Alex Elder499afd52012-02-02 08:13:29 -06002589 rbd_id_put(rbd_dev);
Alex Elder27cc2592012-02-02 08:13:30 -06002590err_nomem:
Alex Elder27cc2592012-02-02 08:13:30 -06002591 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05002592 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06002593
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002594 dout("Error adding device %s\n", buf);
2595 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06002596
2597 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002598}
2599
Alex Elderde71a292012-07-03 16:01:19 -05002600static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002601{
2602 struct list_head *tmp;
2603 struct rbd_device *rbd_dev;
2604
Alex Eldere124a82f2012-01-29 13:57:44 -06002605 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002606 list_for_each(tmp, &rbd_dev_list) {
2607 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05002608 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a82f2012-01-29 13:57:44 -06002609 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002610 return rbd_dev;
Alex Eldere124a82f2012-01-29 13:57:44 -06002611 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002612 }
Alex Eldere124a82f2012-01-29 13:57:44 -06002613 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002614 return NULL;
2615}
2616
/*
 * Release method for the struct device embedded in an rbd_device;
 * the driver core calls this when the device's last reference is
 * dropped.  Tears down the watch, the ceph client, the block
 * device, and finally the rbd_dev itself.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Cancel the lingering watch request, if one was registered */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						rbd_dev->watch_request);
	}
	/* Tell the OSDs we are no longer watching the header object */
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->snap_name);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref (taken in rbd_add) */
	module_put(THIS_MODULE);
}
2647
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002648static ssize_t rbd_remove(struct bus_type *bus,
2649 const char *buf,
2650 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002651{
2652 struct rbd_device *rbd_dev = NULL;
2653 int target_id, rc;
2654 unsigned long ul;
2655 int ret = count;
2656
2657 rc = strict_strtoul(buf, 10, &ul);
2658 if (rc)
2659 return rc;
2660
2661 /* convert to int; abort if we lost anything in the conversion */
2662 target_id = (int) ul;
2663 if (target_id != ul)
2664 return -EINVAL;
2665
2666 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2667
2668 rbd_dev = __rbd_get_dev(target_id);
2669 if (!rbd_dev) {
2670 ret = -ENOENT;
2671 goto done;
2672 }
2673
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002674 __rbd_remove_all_snaps(rbd_dev);
2675 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002676
2677done:
2678 mutex_unlock(&ctl_mutex);
2679 return ret;
2680}
2681
/*
 * Handle a write to a device's snapshot-creation sysfs attribute:
 * create a snapshot with the given name, refresh the header, then
 * (best effort) notify other watchers of the change.
 *
 * Returns count on success, a negative errno otherwise.
 */
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int ret;
	char *name = kmalloc(count + 1, GFP_KERNEL);
	if (!name)
		return -ENOMEM;

	/*
	 * NOTE(review): snprintf copies at most count - 1 bytes (it
	 * reserves one byte of the size for the '\0'), so the final
	 * input character is dropped -- presumably the trailing
	 * newline from sysfs; confirm input always carries one.
	 */
	snprintf(name, count, "%s", buf);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	ret = rbd_header_add_snap(rbd_dev,
				  name, GFP_KERNEL);
	if (ret < 0)
		goto err_unlock;

	/* Re-read the header so the new snapshot becomes visible */
	ret = __rbd_refresh_header(rbd_dev, NULL);
	if (ret < 0)
		goto err_unlock;

	/* shouldn't hold ctl_mutex when notifying.. notify might
	   trigger a watch callback that would need to get that mutex */
	mutex_unlock(&ctl_mutex);

	/* make a best effort, don't error if failed */
	rbd_req_sync_notify(rbd_dev);

	ret = count;
	kfree(name);
	return ret;

err_unlock:
	mutex_unlock(&ctl_mutex);
	kfree(name);
	return ret;
}
2722
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002723/*
2724 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002725 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002726 */
2727static int rbd_sysfs_init(void)
2728{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002729 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002730
Alex Elderfed4c142012-02-07 12:03:36 -06002731 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06002732 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002733 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002734
Alex Elderfed4c142012-02-07 12:03:36 -06002735 ret = bus_register(&rbd_bus_type);
2736 if (ret < 0)
2737 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002738
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002739 return ret;
2740}
2741
/* Remove the sysfs entries: unregister the bus before its root device
 * (reverse of the registration order in rbd_sysfs_init()) */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
2747
2748int __init rbd_init(void)
2749{
2750 int rc;
2751
2752 rc = rbd_sysfs_init();
2753 if (rc)
2754 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002755 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002756 return 0;
2757}
2758
/* Module teardown: undo the sysfs setup done by rbd_init() */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2763
/* Module entry/exit points and metadata */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");