blob: 812fd38cba3dce949142a6185cd1d45c04b355ee [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elderf0f8cef2012-01-29 13:57:44 -060044#define RBD_DRV_NAME "rbd"
45#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070046
47#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
48
Alex Elder21079782012-01-24 10:08:36 -060049#define RBD_MAX_MD_NAME_LEN (RBD_MAX_OBJ_NAME_LEN + sizeof(RBD_SUFFIX))
Yehuda Sadeh602adf42010-08-12 16:11:25 -070050#define RBD_MAX_POOL_NAME_LEN 64
51#define RBD_MAX_SNAP_NAME_LEN 32
52#define RBD_MAX_OPT_LEN 1024
53
54#define RBD_SNAP_HEAD_NAME "-"
55
56#define DEV_NAME_LEN 32
57
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070058#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
59
Yehuda Sadeh602adf42010-08-12 16:11:25 -070060/*
61 * block device image metadata (in-memory version)
62 */
63struct rbd_image_header {
64 u64 image_size;
65 char block_name[32];
66 __u8 obj_order;
67 __u8 crypt_type;
68 __u8 comp_type;
69 struct rw_semaphore snap_rwsem;
70 struct ceph_snap_context *snapc;
71 size_t snap_names_len;
72 u64 snap_seq;
73 u32 total_snaps;
74
75 char *snap_names;
76 u64 *snap_sizes;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070077
78 u64 obj_version;
79};
80
81struct rbd_options {
82 int notify_timeout;
Yehuda Sadeh602adf42010-08-12 16:11:25 -070083};
84
85/*
Alex Elderf0f8cef2012-01-29 13:57:44 -060086 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -070087 */
88struct rbd_client {
89 struct ceph_client *client;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070090 struct rbd_options *rbd_opts;
Yehuda Sadeh602adf42010-08-12 16:11:25 -070091 struct kref kref;
92 struct list_head node;
93};
94
95/*
Alex Elderf0f8cef2012-01-29 13:57:44 -060096 * a request completion status
Yehuda Sadeh602adf42010-08-12 16:11:25 -070097 */
Yehuda Sadeh1fec7092011-05-13 13:52:56 -070098struct rbd_req_status {
99 int done;
100 int rc;
101 u64 bytes;
102};
103
104/*
105 * a collection of requests
106 */
107struct rbd_req_coll {
108 int total;
109 int num_done;
110 struct kref kref;
111 struct rbd_req_status status[0];
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700112};
113
Alex Elderf0f8cef2012-01-29 13:57:44 -0600114/*
115 * a single io request
116 */
117struct rbd_request {
118 struct request *rq; /* blk layer request */
119 struct bio *bio; /* cloned bio */
120 struct page **pages; /* list of used pages */
121 u64 len;
122 int coll_index;
123 struct rbd_req_coll *coll;
124};
125
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800126struct rbd_snap {
127 struct device dev;
128 const char *name;
129 size_t size;
130 struct list_head node;
131 u64 id;
132};
133
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700134/*
135 * a single device
136 */
137struct rbd_device {
138 int id; /* blkdev unique id */
139
140 int major; /* blkdev assigned major */
141 struct gendisk *disk; /* blkdev's gendisk and rq */
142 struct request_queue *q;
143
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700144 struct rbd_client *rbd_client;
145
146 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
147
148 spinlock_t lock; /* queue lock */
149
150 struct rbd_image_header header;
151 char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
152 int obj_len;
153 char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
154 char pool_name[RBD_MAX_POOL_NAME_LEN];
155 int poolid;
156
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700157 struct ceph_osd_event *watch_event;
158 struct ceph_osd_request *watch_request;
159
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700160 char snap_name[RBD_MAX_SNAP_NAME_LEN];
161 u32 cur_snap; /* index+1 of current snapshot within snap context
162 0 - for the head */
163 int read_only;
164
165 struct list_head node;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800166
167 /* list of snapshots */
168 struct list_head snaps;
169
170 /* sysfs related */
171 struct device dev;
172};
173
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700174static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a82f2012-01-29 13:57:44 -0600175
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700176static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a82f2012-01-29 13:57:44 -0600177static DEFINE_SPINLOCK(rbd_dev_list_lock);
178
Alex Elder432b8582012-01-29 13:57:44 -0600179static LIST_HEAD(rbd_client_list); /* clients */
180static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700181
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800182static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
183static void rbd_dev_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800184static ssize_t rbd_snap_add(struct device *dev,
185 struct device_attribute *attr,
186 const char *buf,
187 size_t count);
188static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
Justin P. Mattock69932482011-07-26 23:06:29 -0700189 struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800190
Alex Elderf0f8cef2012-01-29 13:57:44 -0600191static ssize_t rbd_add(struct bus_type *bus, const char *buf,
192 size_t count);
193static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
194 size_t count);
195
196static struct bus_attribute rbd_bus_attrs[] = {
197 __ATTR(add, S_IWUSR, NULL, rbd_add),
198 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
199 __ATTR_NULL
200};
201
202static struct bus_type rbd_bus_type = {
203 .name = "rbd",
204 .bus_attrs = rbd_bus_attrs,
205};
206
207static void rbd_root_dev_release(struct device *dev)
208{
209}
210
211static struct device rbd_root_dev = {
212 .init_name = "rbd",
213 .release = rbd_root_dev_release,
214};
215
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800216
217static struct rbd_device *dev_to_rbd(struct device *dev)
218{
219 return container_of(dev, struct rbd_device, dev);
220}
221
222static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
223{
224 return get_device(&rbd_dev->dev);
225}
226
227static void rbd_put_dev(struct rbd_device *rbd_dev)
228{
229 put_device(&rbd_dev->dev);
230}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700231
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700232static int __rbd_update_snaps(struct rbd_device *rbd_dev);
233
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700234static int rbd_open(struct block_device *bdev, fmode_t mode)
235{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600236 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700237
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800238 rbd_get_dev(rbd_dev);
239
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700240 set_device_ro(bdev, rbd_dev->read_only);
241
242 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
243 return -EROFS;
244
245 return 0;
246}
247
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800248static int rbd_release(struct gendisk *disk, fmode_t mode)
249{
250 struct rbd_device *rbd_dev = disk->private_data;
251
252 rbd_put_dev(rbd_dev);
253
254 return 0;
255}
256
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700257static const struct block_device_operations rbd_bd_ops = {
258 .owner = THIS_MODULE,
259 .open = rbd_open,
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800260 .release = rbd_release,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700261};
262
263/*
264 * Initialize an rbd client instance.
265 * We own *opt.
266 */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700267static struct rbd_client *rbd_client_create(struct ceph_options *opt,
268 struct rbd_options *rbd_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700269{
270 struct rbd_client *rbdc;
271 int ret = -ENOMEM;
272
273 dout("rbd_client_create\n");
274 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
275 if (!rbdc)
276 goto out_opt;
277
278 kref_init(&rbdc->kref);
279 INIT_LIST_HEAD(&rbdc->node);
280
Alex Elderbc534d82012-01-29 13:57:44 -0600281 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
282
Sage Weil6ab00d42011-08-09 09:41:59 -0700283 rbdc->client = ceph_create_client(opt, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700284 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600285 goto out_mutex;
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400286 opt = NULL; /* Now rbdc->client is responsible for opt */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700287
288 ret = ceph_open_session(rbdc->client);
289 if (ret < 0)
290 goto out_err;
291
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700292 rbdc->rbd_opts = rbd_opts;
293
Alex Elder432b8582012-01-29 13:57:44 -0600294 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700295 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600296 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700297
Alex Elderbc534d82012-01-29 13:57:44 -0600298 mutex_unlock(&ctl_mutex);
299
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700300 dout("rbd_client_create created %p\n", rbdc);
301 return rbdc;
302
303out_err:
304 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600305out_mutex:
306 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700307 kfree(rbdc);
308out_opt:
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400309 if (opt)
310 ceph_destroy_options(opt);
311 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700312}
313
314/*
315 * Find a ceph client with specific addr and configuration.
316 */
317static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
318{
319 struct rbd_client *client_node;
320
321 if (opt->flags & CEPH_OPT_NOSHARE)
322 return NULL;
323
324 list_for_each_entry(client_node, &rbd_client_list, node)
325 if (ceph_compare_options(opt, client_node->client) == 0)
326 return client_node;
327 return NULL;
328}
329
330/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700331 * mount options
332 */
333enum {
334 Opt_notify_timeout,
335 Opt_last_int,
336 /* int args above */
337 Opt_last_string,
338 /* string args above */
339};
340
341static match_table_t rbdopt_tokens = {
342 {Opt_notify_timeout, "notify_timeout=%d"},
343 /* int args above */
344 /* string args above */
345 {-1, NULL}
346};
347
348static int parse_rbd_opts_token(char *c, void *private)
349{
350 struct rbd_options *rbdopt = private;
351 substring_t argstr[MAX_OPT_ARGS];
352 int token, intval, ret;
353
Alex Elder21079782012-01-24 10:08:36 -0600354 token = match_token(c, rbdopt_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700355 if (token < 0)
356 return -EINVAL;
357
358 if (token < Opt_last_int) {
359 ret = match_int(&argstr[0], &intval);
360 if (ret < 0) {
361 pr_err("bad mount option arg (not int) "
362 "at '%s'\n", c);
363 return ret;
364 }
365 dout("got int token %d val %d\n", token, intval);
366 } else if (token > Opt_last_int && token < Opt_last_string) {
367 dout("got string token %d val %s\n", token,
368 argstr[0].from);
369 } else {
370 dout("got token %d\n", token);
371 }
372
373 switch (token) {
374 case Opt_notify_timeout:
375 rbdopt->notify_timeout = intval;
376 break;
377 default:
378 BUG_ON(token);
379 }
380 return 0;
381}
382
383/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700384 * Get a ceph client with specific addr and configuration, if one does
385 * not exist create it.
386 */
387static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
388 char *options)
389{
390 struct rbd_client *rbdc;
391 struct ceph_options *opt;
392 int ret;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700393 struct rbd_options *rbd_opts;
394
395 rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
396 if (!rbd_opts)
397 return -ENOMEM;
398
399 rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700400
Alex Elderee577412012-01-24 10:08:36 -0600401 opt = ceph_parse_options(options, mon_addr,
Alex Elder21079782012-01-24 10:08:36 -0600402 mon_addr + strlen(mon_addr),
403 parse_rbd_opts_token, rbd_opts);
Alex Elderee577412012-01-24 10:08:36 -0600404 if (IS_ERR(opt)) {
405 ret = PTR_ERR(opt);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700406 goto done_err;
Alex Elderee577412012-01-24 10:08:36 -0600407 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700408
Alex Elder432b8582012-01-29 13:57:44 -0600409 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700410 rbdc = __rbd_client_find(opt);
411 if (rbdc) {
Alex Eldere6994d3d2012-01-29 13:57:44 -0600412 /* using an existing client */
413 kref_get(&rbdc->kref);
Alex Elder432b8582012-01-29 13:57:44 -0600414 spin_unlock(&rbd_client_list_lock);
Alex Eldere6994d3d2012-01-29 13:57:44 -0600415
416 rbd_dev->rbd_client = rbdc;
417
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700418 ceph_destroy_options(opt);
Alex Elder97bb59a2012-01-24 10:08:36 -0600419 kfree(rbd_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700420
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700421 return 0;
422 }
Alex Elder432b8582012-01-29 13:57:44 -0600423 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700424
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700425 rbdc = rbd_client_create(opt, rbd_opts);
Alex Elderd97081b2012-01-29 13:57:44 -0600426
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700427 if (IS_ERR(rbdc)) {
428 ret = PTR_ERR(rbdc);
429 goto done_err;
430 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700431
432 rbd_dev->rbd_client = rbdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700433 return 0;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700434done_err:
435 kfree(rbd_opts);
436 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700437}
438
439/*
440 * Destroy ceph client
Alex Elderd23a4b32012-01-29 13:57:43 -0600441 *
Alex Elder432b8582012-01-29 13:57:44 -0600442 * Caller must hold rbd_client_list_lock.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700443 */
444static void rbd_client_release(struct kref *kref)
445{
446 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
447
448 dout("rbd_release_client %p\n", rbdc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700449 list_del(&rbdc->node);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700450
451 ceph_destroy_client(rbdc->client);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700452 kfree(rbdc->rbd_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700453 kfree(rbdc);
454}
455
456/*
457 * Drop reference to ceph client node. If it's not referenced anymore, release
458 * it.
459 */
460static void rbd_put_client(struct rbd_device *rbd_dev)
461{
Alex Elder432b8582012-01-29 13:57:44 -0600462 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700463 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
Alex Elder432b8582012-01-29 13:57:44 -0600464 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700465 rbd_dev->rbd_client = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700466}
467
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700468/*
469 * Destroy requests collection
470 */
471static void rbd_coll_release(struct kref *kref)
472{
473 struct rbd_req_coll *coll =
474 container_of(kref, struct rbd_req_coll, kref);
475
476 dout("rbd_coll_release %p\n", coll);
477 kfree(coll);
478}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700479
480/*
481 * Create a new header structure, translate header format from the on-disk
482 * header.
483 */
484static int rbd_header_from_disk(struct rbd_image_header *header,
485 struct rbd_image_header_ondisk *ondisk,
486 int allocated_snaps,
487 gfp_t gfp_flags)
488{
489 int i;
490 u32 snap_count = le32_to_cpu(ondisk->snap_count);
491 int ret = -ENOMEM;
492
Alex Elder21079782012-01-24 10:08:36 -0600493 if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT)))
Josh Durgin81e759f2011-11-15 14:49:53 -0800494 return -ENXIO;
Josh Durgin81e759f2011-11-15 14:49:53 -0800495
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700496 init_rwsem(&header->snap_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700497 header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
498 header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
Alex Elder21079782012-01-24 10:08:36 -0600499 snap_count * sizeof (*ondisk),
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700500 gfp_flags);
501 if (!header->snapc)
502 return -ENOMEM;
503 if (snap_count) {
504 header->snap_names = kmalloc(header->snap_names_len,
505 GFP_KERNEL);
506 if (!header->snap_names)
507 goto err_snapc;
508 header->snap_sizes = kmalloc(snap_count * sizeof(u64),
509 GFP_KERNEL);
510 if (!header->snap_sizes)
511 goto err_names;
512 } else {
513 header->snap_names = NULL;
514 header->snap_sizes = NULL;
515 }
516 memcpy(header->block_name, ondisk->block_name,
517 sizeof(ondisk->block_name));
518
519 header->image_size = le64_to_cpu(ondisk->image_size);
520 header->obj_order = ondisk->options.order;
521 header->crypt_type = ondisk->options.crypt_type;
522 header->comp_type = ondisk->options.comp_type;
523
524 atomic_set(&header->snapc->nref, 1);
525 header->snap_seq = le64_to_cpu(ondisk->snap_seq);
526 header->snapc->num_snaps = snap_count;
527 header->total_snaps = snap_count;
528
Alex Elder21079782012-01-24 10:08:36 -0600529 if (snap_count && allocated_snaps == snap_count) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700530 for (i = 0; i < snap_count; i++) {
531 header->snapc->snaps[i] =
532 le64_to_cpu(ondisk->snaps[i].id);
533 header->snap_sizes[i] =
534 le64_to_cpu(ondisk->snaps[i].image_size);
535 }
536
537 /* copy snapshot names */
538 memcpy(header->snap_names, &ondisk->snaps[i],
539 header->snap_names_len);
540 }
541
542 return 0;
543
544err_names:
545 kfree(header->snap_names);
546err_snapc:
547 kfree(header->snapc);
548 return ret;
549}
550
551static int snap_index(struct rbd_image_header *header, int snap_num)
552{
553 return header->total_snaps - snap_num;
554}
555
556static u64 cur_snap_id(struct rbd_device *rbd_dev)
557{
558 struct rbd_image_header *header = &rbd_dev->header;
559
560 if (!rbd_dev->cur_snap)
561 return 0;
562
563 return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)];
564}
565
566static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
567 u64 *seq, u64 *size)
568{
569 int i;
570 char *p = header->snap_names;
571
572 for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
573 if (strcmp(snap_name, p) == 0)
574 break;
575 }
576 if (i == header->total_snaps)
577 return -ENOENT;
578 if (seq)
579 *seq = header->snapc->snaps[i];
580
581 if (size)
582 *size = header->snap_sizes[i];
583
584 return i;
585}
586
Josh Durgincc9d7342011-11-21 18:19:13 -0800587static int rbd_header_set_snap(struct rbd_device *dev, u64 *size)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700588{
589 struct rbd_image_header *header = &dev->header;
590 struct ceph_snap_context *snapc = header->snapc;
591 int ret = -ENOENT;
592
Josh Durgincc9d7342011-11-21 18:19:13 -0800593 BUILD_BUG_ON(sizeof (dev->snap_name) < sizeof (RBD_SNAP_HEAD_NAME));
594
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700595 down_write(&header->snap_rwsem);
596
Josh Durgincc9d7342011-11-21 18:19:13 -0800597 if (!memcmp(dev->snap_name, RBD_SNAP_HEAD_NAME,
598 sizeof (RBD_SNAP_HEAD_NAME))) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700599 if (header->total_snaps)
600 snapc->seq = header->snap_seq;
601 else
602 snapc->seq = 0;
603 dev->cur_snap = 0;
604 dev->read_only = 0;
605 if (size)
606 *size = header->image_size;
607 } else {
Josh Durgincc9d7342011-11-21 18:19:13 -0800608 ret = snap_by_name(header, dev->snap_name, &snapc->seq, size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700609 if (ret < 0)
610 goto done;
611
612 dev->cur_snap = header->total_snaps - ret;
613 dev->read_only = 1;
614 }
615
616 ret = 0;
617done:
618 up_write(&header->snap_rwsem);
619 return ret;
620}
621
622static void rbd_header_free(struct rbd_image_header *header)
623{
624 kfree(header->snapc);
625 kfree(header->snap_names);
626 kfree(header->snap_sizes);
627}
628
629/*
630 * get the actual striped segment name, offset and length
631 */
632static u64 rbd_get_segment(struct rbd_image_header *header,
633 const char *block_name,
634 u64 ofs, u64 len,
635 char *seg_name, u64 *segofs)
636{
637 u64 seg = ofs >> header->obj_order;
638
639 if (seg_name)
640 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
641 "%s.%012llx", block_name, seg);
642
643 ofs = ofs & ((1 << header->obj_order) - 1);
644 len = min_t(u64, len, (1 << header->obj_order) - ofs);
645
646 if (segofs)
647 *segofs = ofs;
648
649 return len;
650}
651
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700652static int rbd_get_num_segments(struct rbd_image_header *header,
653 u64 ofs, u64 len)
654{
655 u64 start_seg = ofs >> header->obj_order;
656 u64 end_seg = (ofs + len - 1) >> header->obj_order;
657 return end_seg - start_seg + 1;
658}
659
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700660/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700661 * returns the size of an object in the image
662 */
663static u64 rbd_obj_bytes(struct rbd_image_header *header)
664{
665 return 1 << header->obj_order;
666}
667
668/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700669 * bio helpers
670 */
671
672static void bio_chain_put(struct bio *chain)
673{
674 struct bio *tmp;
675
676 while (chain) {
677 tmp = chain;
678 chain = chain->bi_next;
679 bio_put(tmp);
680 }
681}
682
683/*
684 * zeros a bio chain, starting at specific offset
685 */
686static void zero_bio_chain(struct bio *chain, int start_ofs)
687{
688 struct bio_vec *bv;
689 unsigned long flags;
690 void *buf;
691 int i;
692 int pos = 0;
693
694 while (chain) {
695 bio_for_each_segment(bv, chain, i) {
696 if (pos + bv->bv_len > start_ofs) {
697 int remainder = max(start_ofs - pos, 0);
698 buf = bvec_kmap_irq(bv, &flags);
699 memset(buf + remainder, 0,
700 bv->bv_len - remainder);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +0200701 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700702 }
703 pos += bv->bv_len;
704 }
705
706 chain = chain->bi_next;
707 }
708}
709
710/*
711 * bio_chain_clone - clone a chain of bios up to a certain length.
712 * might return a bio_pair that will need to be released.
713 */
714static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
715 struct bio_pair **bp,
716 int len, gfp_t gfpmask)
717{
718 struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
719 int total = 0;
720
721 if (*bp) {
722 bio_pair_release(*bp);
723 *bp = NULL;
724 }
725
726 while (old_chain && (total < len)) {
727 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
728 if (!tmp)
729 goto err_out;
730
731 if (total + old_chain->bi_size > len) {
732 struct bio_pair *bp;
733
734 /*
735 * this split can only happen with a single paged bio,
736 * split_bio will BUG_ON if this is not the case
737 */
738 dout("bio_chain_clone split! total=%d remaining=%d"
739 "bi_size=%d\n",
740 (int)total, (int)len-total,
741 (int)old_chain->bi_size);
742
743 /* split the bio. We'll release it either in the next
744 call, or it will have to be released outside */
745 bp = bio_split(old_chain, (len - total) / 512ULL);
746 if (!bp)
747 goto err_out;
748
749 __bio_clone(tmp, &bp->bio1);
750
751 *next = &bp->bio2;
752 } else {
753 __bio_clone(tmp, old_chain);
754 *next = old_chain->bi_next;
755 }
756
757 tmp->bi_bdev = NULL;
758 gfpmask &= ~__GFP_WAIT;
759 tmp->bi_next = NULL;
760
761 if (!new_chain) {
762 new_chain = tail = tmp;
763 } else {
764 tail->bi_next = tmp;
765 tail = tmp;
766 }
767 old_chain = old_chain->bi_next;
768
769 total += tmp->bi_size;
770 }
771
772 BUG_ON(total < len);
773
774 if (tail)
775 tail->bi_next = NULL;
776
777 *old = old_chain;
778
779 return new_chain;
780
781err_out:
782 dout("bio_chain_clone with err\n");
783 bio_chain_put(new_chain);
784 return NULL;
785}
786
787/*
788 * helpers for osd request op vectors.
789 */
790static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
791 int num_ops,
792 int opcode,
793 u32 payload_len)
794{
795 *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
796 GFP_NOIO);
797 if (!*ops)
798 return -ENOMEM;
799 (*ops)[0].op = opcode;
800 /*
801 * op extent offset and length will be set later on
802 * in calc_raw_layout()
803 */
804 (*ops)[0].payload_len = payload_len;
805 return 0;
806}
807
808static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
809{
810 kfree(ops);
811}
812
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700813static void rbd_coll_end_req_index(struct request *rq,
814 struct rbd_req_coll *coll,
815 int index,
816 int ret, u64 len)
817{
818 struct request_queue *q;
819 int min, max, i;
820
821 dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n",
822 coll, index, ret, len);
823
824 if (!rq)
825 return;
826
827 if (!coll) {
828 blk_end_request(rq, ret, len);
829 return;
830 }
831
832 q = rq->q;
833
834 spin_lock_irq(q->queue_lock);
835 coll->status[index].done = 1;
836 coll->status[index].rc = ret;
837 coll->status[index].bytes = len;
838 max = min = coll->num_done;
839 while (max < coll->total && coll->status[max].done)
840 max++;
841
842 for (i = min; i<max; i++) {
843 __blk_end_request(rq, coll->status[i].rc,
844 coll->status[i].bytes);
845 coll->num_done++;
846 kref_put(&coll->kref, rbd_coll_release);
847 }
848 spin_unlock_irq(q->queue_lock);
849}
850
851static void rbd_coll_end_req(struct rbd_request *req,
852 int ret, u64 len)
853{
854 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
855}
856
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700857/*
858 * Send ceph osd request
859 */
860static int rbd_do_request(struct request *rq,
861 struct rbd_device *dev,
862 struct ceph_snap_context *snapc,
863 u64 snapid,
864 const char *obj, u64 ofs, u64 len,
865 struct bio *bio,
866 struct page **pages,
867 int num_pages,
868 int flags,
869 struct ceph_osd_req_op *ops,
870 int num_reply,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700871 struct rbd_req_coll *coll,
872 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700873 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700874 struct ceph_msg *msg),
875 struct ceph_osd_request **linger_req,
876 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700877{
878 struct ceph_osd_request *req;
879 struct ceph_file_layout *layout;
880 int ret;
881 u64 bno;
882 struct timespec mtime = CURRENT_TIME;
883 struct rbd_request *req_data;
884 struct ceph_osd_request_head *reqhead;
885 struct rbd_image_header *header = &dev->header;
Alex Elder1dbb4392012-01-24 10:08:37 -0600886 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700887
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700888 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700889 if (!req_data) {
890 if (coll)
891 rbd_coll_end_req_index(rq, coll, coll_index,
892 -ENOMEM, len);
893 return -ENOMEM;
894 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700895
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700896 if (coll) {
897 req_data->coll = coll;
898 req_data->coll_index = coll_index;
899 }
900
901 dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700902
903 down_read(&header->snap_rwsem);
904
Alex Elder1dbb4392012-01-24 10:08:37 -0600905 osdc = &dev->rbd_client->client->osdc;
906 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
907 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -0700908 if (!req) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700909 up_read(&header->snap_rwsem);
Sage Weil4ad12622011-05-03 09:23:36 -0700910 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700911 goto done_pages;
912 }
913
914 req->r_callback = rbd_cb;
915
916 req_data->rq = rq;
917 req_data->bio = bio;
918 req_data->pages = pages;
919 req_data->len = len;
920
921 req->r_priv = req_data;
922
923 reqhead = req->r_request->front.iov_base;
924 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
925
926 strncpy(req->r_oid, obj, sizeof(req->r_oid));
927 req->r_oid_len = strlen(req->r_oid);
928
929 layout = &req->r_file_layout;
930 memset(layout, 0, sizeof(*layout));
931 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
932 layout->fl_stripe_count = cpu_to_le32(1);
933 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
934 layout->fl_pg_preferred = cpu_to_le32(-1);
935 layout->fl_pg_pool = cpu_to_le32(dev->poolid);
Alex Elder1dbb4392012-01-24 10:08:37 -0600936 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
937 req, ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700938
939 ceph_osdc_build_request(req, ofs, &len,
940 ops,
941 snapc,
942 &mtime,
943 req->r_oid, req->r_oid_len);
944 up_read(&header->snap_rwsem);
945
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700946 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600947 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700948 *linger_req = req;
949 }
950
Alex Elder1dbb4392012-01-24 10:08:37 -0600951 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700952 if (ret < 0)
953 goto done_err;
954
955 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600956 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700957 if (ver)
958 *ver = le64_to_cpu(req->r_reassert_version.version);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700959 dout("reassert_ver=%lld\n",
960 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700961 ceph_osdc_put_request(req);
962 }
963 return ret;
964
965done_err:
966 bio_chain_put(req_data->bio);
967 ceph_osdc_put_request(req);
968done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700969 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700970 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700971 return ret;
972}
973
974/*
975 * Ceph osd op callback
976 */
977static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
978{
979 struct rbd_request *req_data = req->r_priv;
980 struct ceph_osd_reply_head *replyhead;
981 struct ceph_osd_op *op;
982 __s32 rc;
983 u64 bytes;
984 int read_op;
985
986 /* parse reply */
987 replyhead = msg->front.iov_base;
988 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
989 op = (void *)(replyhead + 1);
990 rc = le32_to_cpu(replyhead->result);
991 bytes = le64_to_cpu(op->extent.length);
992 read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ);
993
994 dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);
995
996 if (rc == -ENOENT && read_op) {
997 zero_bio_chain(req_data->bio, 0);
998 rc = 0;
999 } else if (rc == 0 && read_op && bytes < req_data->len) {
1000 zero_bio_chain(req_data->bio, bytes);
1001 bytes = req_data->len;
1002 }
1003
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001004 rbd_coll_end_req(req_data, rc, bytes);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001005
1006 if (req_data->bio)
1007 bio_chain_put(req_data->bio);
1008
1009 ceph_osdc_put_request(req);
1010 kfree(req_data);
1011}
1012
/*
 * Minimal osd completion callback: just drop the request reference.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1017
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001018/*
1019 * Do a synchronous ceph osd operation
1020 */
1021static int rbd_req_sync_op(struct rbd_device *dev,
1022 struct ceph_snap_context *snapc,
1023 u64 snapid,
1024 int opcode,
1025 int flags,
1026 struct ceph_osd_req_op *orig_ops,
1027 int num_reply,
1028 const char *obj,
1029 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001030 char *buf,
1031 struct ceph_osd_request **linger_req,
1032 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001033{
1034 int ret;
1035 struct page **pages;
1036 int num_pages;
1037 struct ceph_osd_req_op *ops = orig_ops;
1038 u32 payload_len;
1039
1040 num_pages = calc_pages_for(ofs , len);
1041 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
Dan Carpenterb8d06382010-10-11 21:14:23 +02001042 if (IS_ERR(pages))
1043 return PTR_ERR(pages);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001044
1045 if (!orig_ops) {
1046 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
1047 ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
1048 if (ret < 0)
1049 goto done;
1050
1051 if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
1052 ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
1053 if (ret < 0)
1054 goto done_ops;
1055 }
1056 }
1057
1058 ret = rbd_do_request(NULL, dev, snapc, snapid,
1059 obj, ofs, len, NULL,
1060 pages, num_pages,
1061 flags,
1062 ops,
1063 2,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001064 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001065 NULL,
1066 linger_req, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001067 if (ret < 0)
1068 goto done_ops;
1069
1070 if ((flags & CEPH_OSD_FLAG_READ) && buf)
1071 ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
1072
1073done_ops:
1074 if (!orig_ops)
1075 rbd_destroy_ops(ops);
1076done:
1077 ceph_release_page_vector(pages, num_pages);
1078 return ret;
1079}
1080
1081/*
1082 * Do an asynchronous ceph osd operation
1083 */
1084static int rbd_do_op(struct request *rq,
1085 struct rbd_device *rbd_dev ,
1086 struct ceph_snap_context *snapc,
1087 u64 snapid,
1088 int opcode, int flags, int num_reply,
1089 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001090 struct bio *bio,
1091 struct rbd_req_coll *coll,
1092 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001093{
1094 char *seg_name;
1095 u64 seg_ofs;
1096 u64 seg_len;
1097 int ret;
1098 struct ceph_osd_req_op *ops;
1099 u32 payload_len;
1100
1101 seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
1102 if (!seg_name)
1103 return -ENOMEM;
1104
1105 seg_len = rbd_get_segment(&rbd_dev->header,
1106 rbd_dev->header.block_name,
1107 ofs, len,
1108 seg_name, &seg_ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001109
1110 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1111
1112 ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
1113 if (ret < 0)
1114 goto done;
1115
1116 /* we've taken care of segment sizes earlier when we
1117 cloned the bios. We should never have a segment
1118 truncated at this point */
1119 BUG_ON(seg_len < len);
1120
1121 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1122 seg_name, seg_ofs, seg_len,
1123 bio,
1124 NULL, 0,
1125 flags,
1126 ops,
1127 num_reply,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001128 coll, coll_index,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001129 rbd_req_cb, 0, NULL);
Sage Weil11f77002011-05-12 16:13:54 -07001130
1131 rbd_destroy_ops(ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001132done:
1133 kfree(seg_name);
1134 return ret;
1135}
1136
1137/*
1138 * Request async osd write
1139 */
1140static int rbd_req_write(struct request *rq,
1141 struct rbd_device *rbd_dev,
1142 struct ceph_snap_context *snapc,
1143 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001144 struct bio *bio,
1145 struct rbd_req_coll *coll,
1146 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001147{
1148 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1149 CEPH_OSD_OP_WRITE,
1150 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1151 2,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001152 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001153}
1154
1155/*
1156 * Request async osd read
1157 */
1158static int rbd_req_read(struct request *rq,
1159 struct rbd_device *rbd_dev,
1160 u64 snapid,
1161 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001162 struct bio *bio,
1163 struct rbd_req_coll *coll,
1164 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001165{
1166 return rbd_do_op(rq, rbd_dev, NULL,
1167 (snapid ? snapid : CEPH_NOSNAP),
1168 CEPH_OSD_OP_READ,
1169 CEPH_OSD_FLAG_READ,
1170 2,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001171 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001172}
1173
1174/*
1175 * Request sync osd read
1176 */
1177static int rbd_req_sync_read(struct rbd_device *dev,
1178 struct ceph_snap_context *snapc,
1179 u64 snapid,
1180 const char *obj,
1181 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001182 char *buf,
1183 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001184{
1185 return rbd_req_sync_op(dev, NULL,
1186 (snapid ? snapid : CEPH_NOSNAP),
1187 CEPH_OSD_OP_READ,
1188 CEPH_OSD_FLAG_READ,
1189 NULL,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001190 1, obj, ofs, len, buf, NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001191}
1192
1193/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001194 * Request sync osd watch
1195 */
1196static int rbd_req_sync_notify_ack(struct rbd_device *dev,
1197 u64 ver,
1198 u64 notify_id,
1199 const char *obj)
1200{
1201 struct ceph_osd_req_op *ops;
1202 struct page **pages = NULL;
Sage Weil11f77002011-05-12 16:13:54 -07001203 int ret;
1204
1205 ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001206 if (ret < 0)
1207 return ret;
1208
1209 ops[0].watch.ver = cpu_to_le64(dev->header.obj_version);
1210 ops[0].watch.cookie = notify_id;
1211 ops[0].watch.flag = 0;
1212
1213 ret = rbd_do_request(NULL, dev, NULL, CEPH_NOSNAP,
1214 obj, 0, 0, NULL,
1215 pages, 0,
1216 CEPH_OSD_FLAG_READ,
1217 ops,
1218 1,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001219 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001220 rbd_simple_req_cb, 0, NULL);
1221
1222 rbd_destroy_ops(ops);
1223 return ret;
1224}
1225
1226static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1227{
1228 struct rbd_device *dev = (struct rbd_device *)data;
Sage Weil13143d22011-05-12 16:08:30 -07001229 int rc;
1230
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001231 if (!dev)
1232 return;
1233
1234 dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
1235 notify_id, (int)opcode);
1236 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Sage Weil13143d22011-05-12 16:08:30 -07001237 rc = __rbd_update_snaps(dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001238 mutex_unlock(&ctl_mutex);
Sage Weil13143d22011-05-12 16:08:30 -07001239 if (rc)
Alex Elderf0f8cef2012-01-29 13:57:44 -06001240 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
1241 " update snaps: %d\n", dev->major, rc);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001242
1243 rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name);
1244}
1245
1246/*
1247 * Request sync osd watch
1248 */
1249static int rbd_req_sync_watch(struct rbd_device *dev,
1250 const char *obj,
1251 u64 ver)
1252{
1253 struct ceph_osd_req_op *ops;
Alex Elder1dbb4392012-01-24 10:08:37 -06001254 struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001255
1256 int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
1257 if (ret < 0)
1258 return ret;
1259
1260 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
1261 (void *)dev, &dev->watch_event);
1262 if (ret < 0)
1263 goto fail;
1264
1265 ops[0].watch.ver = cpu_to_le64(ver);
1266 ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
1267 ops[0].watch.flag = 1;
1268
1269 ret = rbd_req_sync_op(dev, NULL,
1270 CEPH_NOSNAP,
1271 0,
1272 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1273 ops,
1274 1, obj, 0, 0, NULL,
1275 &dev->watch_request, NULL);
1276
1277 if (ret < 0)
1278 goto fail_event;
1279
1280 rbd_destroy_ops(ops);
1281 return 0;
1282
1283fail_event:
1284 ceph_osdc_cancel_event(dev->watch_event);
1285 dev->watch_event = NULL;
1286fail:
1287 rbd_destroy_ops(ops);
1288 return ret;
1289}
1290
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001291/*
1292 * Request sync osd unwatch
1293 */
1294static int rbd_req_sync_unwatch(struct rbd_device *dev,
1295 const char *obj)
1296{
1297 struct ceph_osd_req_op *ops;
1298
1299 int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
1300 if (ret < 0)
1301 return ret;
1302
1303 ops[0].watch.ver = 0;
1304 ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
1305 ops[0].watch.flag = 0;
1306
1307 ret = rbd_req_sync_op(dev, NULL,
1308 CEPH_NOSNAP,
1309 0,
1310 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1311 ops,
1312 1, obj, 0, 0, NULL, NULL, NULL);
1313
1314 rbd_destroy_ops(ops);
1315 ceph_osdc_cancel_event(dev->watch_event);
1316 dev->watch_event = NULL;
1317 return ret;
1318}
1319
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001320struct rbd_notify_info {
1321 struct rbd_device *dev;
1322};
1323
1324static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1325{
1326 struct rbd_device *dev = (struct rbd_device *)data;
1327 if (!dev)
1328 return;
1329
1330 dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
1331 notify_id, (int)opcode);
1332}
1333
1334/*
1335 * Request sync osd notify
1336 */
1337static int rbd_req_sync_notify(struct rbd_device *dev,
1338 const char *obj)
1339{
1340 struct ceph_osd_req_op *ops;
Alex Elder1dbb4392012-01-24 10:08:37 -06001341 struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001342 struct ceph_osd_event *event;
1343 struct rbd_notify_info info;
1344 int payload_len = sizeof(u32) + sizeof(u32);
1345 int ret;
1346
1347 ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
1348 if (ret < 0)
1349 return ret;
1350
1351 info.dev = dev;
1352
1353 ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
1354 (void *)&info, &event);
1355 if (ret < 0)
1356 goto fail;
1357
1358 ops[0].watch.ver = 1;
1359 ops[0].watch.flag = 1;
1360 ops[0].watch.cookie = event->cookie;
1361 ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
1362 ops[0].watch.timeout = 12;
1363
1364 ret = rbd_req_sync_op(dev, NULL,
1365 CEPH_NOSNAP,
1366 0,
1367 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1368 ops,
1369 1, obj, 0, 0, NULL, NULL, NULL);
1370 if (ret < 0)
1371 goto fail_event;
1372
1373 ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
1374 dout("ceph_osdc_wait_event returned %d\n", ret);
1375 rbd_destroy_ops(ops);
1376 return 0;
1377
1378fail_event:
1379 ceph_osdc_cancel_event(event);
1380fail:
1381 rbd_destroy_ops(ops);
1382 return ret;
1383}
1384
1385/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001386 * Request sync osd read
1387 */
1388static int rbd_req_sync_exec(struct rbd_device *dev,
1389 const char *obj,
1390 const char *cls,
1391 const char *method,
1392 const char *data,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001393 int len,
1394 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001395{
1396 struct ceph_osd_req_op *ops;
1397 int cls_len = strlen(cls);
1398 int method_len = strlen(method);
1399 int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
1400 cls_len + method_len + len);
1401 if (ret < 0)
1402 return ret;
1403
1404 ops[0].cls.class_name = cls;
1405 ops[0].cls.class_len = (__u8)cls_len;
1406 ops[0].cls.method_name = method;
1407 ops[0].cls.method_len = (__u8)method_len;
1408 ops[0].cls.argc = 0;
1409 ops[0].cls.indata = data;
1410 ops[0].cls.indata_len = len;
1411
1412 ret = rbd_req_sync_op(dev, NULL,
1413 CEPH_NOSNAP,
1414 0,
1415 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1416 ops,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001417 1, obj, 0, 0, NULL, NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001418
1419 rbd_destroy_ops(ops);
1420
1421 dout("cls_exec returned %d\n", ret);
1422 return ret;
1423}
1424
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001425static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1426{
1427 struct rbd_req_coll *coll =
1428 kzalloc(sizeof(struct rbd_req_coll) +
1429 sizeof(struct rbd_req_status) * num_reqs,
1430 GFP_ATOMIC);
1431
1432 if (!coll)
1433 return NULL;
1434 coll->total = num_reqs;
1435 kref_init(&coll->kref);
1436 return coll;
1437}
1438
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001439/*
1440 * block device queue callback
1441 */
1442static void rbd_rq_fn(struct request_queue *q)
1443{
1444 struct rbd_device *rbd_dev = q->queuedata;
1445 struct request *rq;
1446 struct bio_pair *bp = NULL;
1447
1448 rq = blk_fetch_request(q);
1449
1450 while (1) {
1451 struct bio *bio;
1452 struct bio *rq_bio, *next_bio = NULL;
1453 bool do_write;
1454 int size, op_size = 0;
1455 u64 ofs;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001456 int num_segs, cur_seg = 0;
1457 struct rbd_req_coll *coll;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001458
1459 /* peek at request from block layer */
1460 if (!rq)
1461 break;
1462
1463 dout("fetched request\n");
1464
1465 /* filter out block requests we don't understand */
1466 if ((rq->cmd_type != REQ_TYPE_FS)) {
1467 __blk_end_request_all(rq, 0);
1468 goto next;
1469 }
1470
1471 /* deduce our operation (read, write) */
1472 do_write = (rq_data_dir(rq) == WRITE);
1473
1474 size = blk_rq_bytes(rq);
1475 ofs = blk_rq_pos(rq) * 512ULL;
1476 rq_bio = rq->bio;
1477 if (do_write && rbd_dev->read_only) {
1478 __blk_end_request_all(rq, -EROFS);
1479 goto next;
1480 }
1481
1482 spin_unlock_irq(q->queue_lock);
1483
1484 dout("%s 0x%x bytes at 0x%llx\n",
1485 do_write ? "write" : "read",
1486 size, blk_rq_pos(rq) * 512ULL);
1487
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001488 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
1489 coll = rbd_alloc_coll(num_segs);
1490 if (!coll) {
1491 spin_lock_irq(q->queue_lock);
1492 __blk_end_request_all(rq, -ENOMEM);
1493 goto next;
1494 }
1495
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001496 do {
1497 /* a bio clone to be passed down to OSD req */
1498 dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
1499 op_size = rbd_get_segment(&rbd_dev->header,
1500 rbd_dev->header.block_name,
1501 ofs, size,
1502 NULL, NULL);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001503 kref_get(&coll->kref);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001504 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1505 op_size, GFP_ATOMIC);
1506 if (!bio) {
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001507 rbd_coll_end_req_index(rq, coll, cur_seg,
1508 -ENOMEM, op_size);
1509 goto next_seg;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001510 }
1511
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001512
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001513 /* init OSD command: write or read */
1514 if (do_write)
1515 rbd_req_write(rq, rbd_dev,
1516 rbd_dev->header.snapc,
1517 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001518 op_size, bio,
1519 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001520 else
1521 rbd_req_read(rq, rbd_dev,
1522 cur_snap_id(rbd_dev),
1523 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001524 op_size, bio,
1525 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001526
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001527next_seg:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001528 size -= op_size;
1529 ofs += op_size;
1530
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001531 cur_seg++;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001532 rq_bio = next_bio;
1533 } while (size > 0);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001534 kref_put(&coll->kref, rbd_coll_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001535
1536 if (bp)
1537 bio_pair_release(bp);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001538 spin_lock_irq(q->queue_lock);
1539next:
1540 rq = blk_fetch_request(q);
1541 }
1542}
1543
1544/*
1545 * a queue callback. Makes sure that we don't create a bio that spans across
1546 * multiple osd objects. One exception would be with a single page bios,
1547 * which we handle later at bio_chain_clone
1548 */
1549static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1550 struct bio_vec *bvec)
1551{
1552 struct rbd_device *rbd_dev = q->queuedata;
1553 unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9);
1554 sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1555 unsigned int bio_sectors = bmd->bi_size >> 9;
1556 int max;
1557
1558 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
1559 + bio_sectors)) << 9;
1560 if (max < 0)
1561 max = 0; /* bio_add cannot handle a negative return */
1562 if (max <= bvec->bv_len && bio_sectors == 0)
1563 return bvec->bv_len;
1564 return max;
1565}
1566
1567static void rbd_free_disk(struct rbd_device *rbd_dev)
1568{
1569 struct gendisk *disk = rbd_dev->disk;
1570
1571 if (!disk)
1572 return;
1573
1574 rbd_header_free(&rbd_dev->header);
1575
1576 if (disk->flags & GENHD_FL_UP)
1577 del_gendisk(disk);
1578 if (disk->queue)
1579 blk_cleanup_queue(disk->queue);
1580 put_disk(disk);
1581}
1582
1583/*
1584 * reload the ondisk the header
1585 */
1586static int rbd_read_header(struct rbd_device *rbd_dev,
1587 struct rbd_image_header *header)
1588{
1589 ssize_t rc;
1590 struct rbd_image_header_ondisk *dh;
1591 int snap_count = 0;
1592 u64 snap_names_len = 0;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001593 u64 ver;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001594
1595 while (1) {
1596 int len = sizeof(*dh) +
1597 snap_count * sizeof(struct rbd_image_snap_ondisk) +
1598 snap_names_len;
1599
1600 rc = -ENOMEM;
1601 dh = kmalloc(len, GFP_KERNEL);
1602 if (!dh)
1603 return -ENOMEM;
1604
1605 rc = rbd_req_sync_read(rbd_dev,
1606 NULL, CEPH_NOSNAP,
1607 rbd_dev->obj_md_name,
1608 0, len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001609 (char *)dh, &ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001610 if (rc < 0)
1611 goto out_dh;
1612
1613 rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
Josh Durgin81e759f2011-11-15 14:49:53 -08001614 if (rc < 0) {
1615 if (rc == -ENXIO) {
1616 pr_warning("unrecognized header format"
1617 " for image %s", rbd_dev->obj);
1618 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001619 goto out_dh;
Josh Durgin81e759f2011-11-15 14:49:53 -08001620 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001621
1622 if (snap_count != header->total_snaps) {
1623 snap_count = header->total_snaps;
1624 snap_names_len = header->snap_names_len;
1625 rbd_header_free(header);
1626 kfree(dh);
1627 continue;
1628 }
1629 break;
1630 }
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001631 header->obj_version = ver;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001632
1633out_dh:
1634 kfree(dh);
1635 return rc;
1636}
1637
1638/*
1639 * create a snapshot
1640 */
1641static int rbd_header_add_snap(struct rbd_device *dev,
1642 const char *snap_name,
1643 gfp_t gfp_flags)
1644{
1645 int name_len = strlen(snap_name);
1646 u64 new_snapid;
1647 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001648 void *data, *p, *e;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001649 u64 ver;
Alex Elder1dbb4392012-01-24 10:08:37 -06001650 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001651
1652 /* we should create a snapshot only if we're pointing at the head */
1653 if (dev->cur_snap)
1654 return -EINVAL;
1655
Alex Elder1dbb4392012-01-24 10:08:37 -06001656 monc = &dev->rbd_client->client->monc;
1657 ret = ceph_monc_create_snapid(monc, dev->poolid, &new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001658 dout("created snapid=%lld\n", new_snapid);
1659 if (ret < 0)
1660 return ret;
1661
1662 data = kmalloc(name_len + 16, gfp_flags);
1663 if (!data)
1664 return -ENOMEM;
1665
Sage Weil916d4d62011-05-12 16:10:50 -07001666 p = data;
1667 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001668
Sage Weil916d4d62011-05-12 16:10:50 -07001669 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1670 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001671
1672 ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
Sage Weil916d4d62011-05-12 16:10:50 -07001673 data, p - data, &ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001674
Sage Weil916d4d62011-05-12 16:10:50 -07001675 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001676
1677 if (ret < 0)
1678 return ret;
1679
1680 dev->header.snapc->seq = new_snapid;
1681
1682 return 0;
1683bad:
1684 return -ERANGE;
1685}
1686
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001687static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1688{
1689 struct rbd_snap *snap;
1690
1691 while (!list_empty(&rbd_dev->snaps)) {
1692 snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node);
1693 __rbd_remove_snap_dev(rbd_dev, snap);
1694 }
1695}
1696
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001697/*
1698 * only read the first part of the ondisk header, without the snaps info
1699 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001700static int __rbd_update_snaps(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001701{
1702 int ret;
1703 struct rbd_image_header h;
1704 u64 snap_seq;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001705 int follow_seq = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001706
1707 ret = rbd_read_header(rbd_dev, &h);
1708 if (ret < 0)
1709 return ret;
1710
Sage Weil9db4b3e2011-04-19 22:49:06 -07001711 /* resized? */
1712 set_capacity(rbd_dev->disk, h.image_size / 512ULL);
1713
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001714 down_write(&rbd_dev->header.snap_rwsem);
1715
1716 snap_seq = rbd_dev->header.snapc->seq;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001717 if (rbd_dev->header.total_snaps &&
1718 rbd_dev->header.snapc->snaps[0] == snap_seq)
1719 /* pointing at the head, will need to follow that
1720 if head moves */
1721 follow_seq = 1;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001722
1723 kfree(rbd_dev->header.snapc);
1724 kfree(rbd_dev->header.snap_names);
1725 kfree(rbd_dev->header.snap_sizes);
1726
1727 rbd_dev->header.total_snaps = h.total_snaps;
1728 rbd_dev->header.snapc = h.snapc;
1729 rbd_dev->header.snap_names = h.snap_names;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001730 rbd_dev->header.snap_names_len = h.snap_names_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001731 rbd_dev->header.snap_sizes = h.snap_sizes;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001732 if (follow_seq)
1733 rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0];
1734 else
1735 rbd_dev->header.snapc->seq = snap_seq;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001736
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001737 ret = __rbd_init_snaps_header(rbd_dev);
1738
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001739 up_write(&rbd_dev->header.snap_rwsem);
1740
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001741 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001742}
1743
1744static int rbd_init_disk(struct rbd_device *rbd_dev)
1745{
1746 struct gendisk *disk;
1747 struct request_queue *q;
1748 int rc;
1749 u64 total_size = 0;
1750
1751 /* contact OSD, request size info about the object being mapped */
1752 rc = rbd_read_header(rbd_dev, &rbd_dev->header);
1753 if (rc)
1754 return rc;
1755
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001756 /* no need to lock here, as rbd_dev is not registered yet */
1757 rc = __rbd_init_snaps_header(rbd_dev);
1758 if (rc)
1759 return rc;
1760
Josh Durgincc9d7342011-11-21 18:19:13 -08001761 rc = rbd_header_set_snap(rbd_dev, &total_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001762 if (rc)
1763 return rc;
1764
1765 /* create gendisk info */
1766 rc = -ENOMEM;
1767 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1768 if (!disk)
1769 goto out;
1770
Alex Elderf0f8cef2012-01-29 13:57:44 -06001771 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Sage Weilaedfec52011-05-12 20:57:03 -07001772 rbd_dev->id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001773 disk->major = rbd_dev->major;
1774 disk->first_minor = 0;
1775 disk->fops = &rbd_bd_ops;
1776 disk->private_data = rbd_dev;
1777
1778 /* init rq */
1779 rc = -ENOMEM;
1780 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1781 if (!q)
1782 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07001783
1784 /* set io sizes to object size */
1785 blk_queue_max_hw_sectors(q, rbd_obj_bytes(&rbd_dev->header) / 512ULL);
1786 blk_queue_max_segment_size(q, rbd_obj_bytes(&rbd_dev->header));
1787 blk_queue_io_min(q, rbd_obj_bytes(&rbd_dev->header));
1788 blk_queue_io_opt(q, rbd_obj_bytes(&rbd_dev->header));
1789
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001790 blk_queue_merge_bvec(q, rbd_merge_bvec);
1791 disk->queue = q;
1792
1793 q->queuedata = rbd_dev;
1794
1795 rbd_dev->disk = disk;
1796 rbd_dev->q = q;
1797
1798 /* finally, announce the disk to the world */
1799 set_capacity(disk, total_size / 512ULL);
1800 add_disk(disk);
1801
1802 pr_info("%s: added with size 0x%llx\n",
1803 disk->disk_name, (unsigned long long)total_size);
1804 return 0;
1805
1806out_disk:
1807 put_disk(disk);
1808out:
1809 return rc;
1810}
1811
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001812/*
1813 sysfs
1814*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001815
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001816static ssize_t rbd_size_show(struct device *dev,
1817 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001818{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001819 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1820
1821 return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001822}
1823
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001824static ssize_t rbd_major_show(struct device *dev,
1825 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001826{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001827 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1828
1829 return sprintf(buf, "%d\n", rbd_dev->major);
1830}
1831
1832static ssize_t rbd_client_id_show(struct device *dev,
1833 struct device_attribute *attr, char *buf)
1834{
1835 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1836
Alex Elder1dbb4392012-01-24 10:08:37 -06001837 return sprintf(buf, "client%lld\n",
1838 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001839}
1840
1841static ssize_t rbd_pool_show(struct device *dev,
1842 struct device_attribute *attr, char *buf)
1843{
1844 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1845
1846 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1847}
1848
1849static ssize_t rbd_name_show(struct device *dev,
1850 struct device_attribute *attr, char *buf)
1851{
1852 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1853
1854 return sprintf(buf, "%s\n", rbd_dev->obj);
1855}
1856
1857static ssize_t rbd_snap_show(struct device *dev,
1858 struct device_attribute *attr,
1859 char *buf)
1860{
1861 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1862
1863 return sprintf(buf, "%s\n", rbd_dev->snap_name);
1864}
1865
1866static ssize_t rbd_image_refresh(struct device *dev,
1867 struct device_attribute *attr,
1868 const char *buf,
1869 size_t size)
1870{
1871 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1872 int rc;
1873 int ret = size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001874
1875 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1876
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001877 rc = __rbd_update_snaps(rbd_dev);
1878 if (rc < 0)
1879 ret = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001880
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001881 mutex_unlock(&ctl_mutex);
1882 return ret;
1883}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001884
/* per-device sysfs attributes (read-only unless noted) */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);	/* write-only trigger */
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);	/* write-only trigger */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001893
1894static struct attribute *rbd_attrs[] = {
1895 &dev_attr_size.attr,
1896 &dev_attr_major.attr,
1897 &dev_attr_client_id.attr,
1898 &dev_attr_pool.attr,
1899 &dev_attr_name.attr,
1900 &dev_attr_current_snap.attr,
1901 &dev_attr_refresh.attr,
1902 &dev_attr_create_snap.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001903 NULL
1904};
1905
1906static struct attribute_group rbd_attr_group = {
1907 .attrs = rbd_attrs,
1908};
1909
1910static const struct attribute_group *rbd_attr_groups[] = {
1911 &rbd_attr_group,
1912 NULL
1913};
1914
1915static void rbd_sysfs_dev_release(struct device *dev)
1916{
1917}
1918
1919static struct device_type rbd_device_type = {
1920 .name = "rbd",
1921 .groups = rbd_attr_groups,
1922 .release = rbd_sysfs_dev_release,
1923};
1924
1925
1926/*
1927 sysfs - snapshots
1928*/
1929
1930static ssize_t rbd_snap_size_show(struct device *dev,
1931 struct device_attribute *attr,
1932 char *buf)
1933{
1934 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1935
1936 return sprintf(buf, "%lld\n", (long long)snap->size);
1937}
1938
1939static ssize_t rbd_snap_id_show(struct device *dev,
1940 struct device_attribute *attr,
1941 char *buf)
1942{
1943 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1944
1945 return sprintf(buf, "%lld\n", (long long)snap->id);
1946}
1947
1948static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
1949static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
1950
1951static struct attribute *rbd_snap_attrs[] = {
1952 &dev_attr_snap_size.attr,
1953 &dev_attr_snap_id.attr,
1954 NULL,
1955};
1956
1957static struct attribute_group rbd_snap_attr_group = {
1958 .attrs = rbd_snap_attrs,
1959};
1960
1961static void rbd_snap_dev_release(struct device *dev)
1962{
1963 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1964 kfree(snap->name);
1965 kfree(snap);
1966}
1967
1968static const struct attribute_group *rbd_snap_attr_groups[] = {
1969 &rbd_snap_attr_group,
1970 NULL
1971};
1972
1973static struct device_type rbd_snap_device_type = {
1974 .groups = rbd_snap_attr_groups,
1975 .release = rbd_snap_dev_release,
1976};
1977
1978static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
1979 struct rbd_snap *snap)
1980{
1981 list_del(&snap->node);
1982 device_unregister(&snap->dev);
1983}
1984
1985static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
1986 struct rbd_snap *snap,
1987 struct device *parent)
1988{
1989 struct device *dev = &snap->dev;
1990 int ret;
1991
1992 dev->type = &rbd_snap_device_type;
1993 dev->parent = parent;
1994 dev->release = rbd_snap_dev_release;
1995 dev_set_name(dev, "snap_%s", snap->name);
1996 ret = device_register(dev);
1997
1998 return ret;
1999}
2000
2001static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
2002 int i, const char *name,
2003 struct rbd_snap **snapp)
2004{
2005 int ret;
2006 struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
2007 if (!snap)
2008 return -ENOMEM;
2009 snap->name = kstrdup(name, GFP_KERNEL);
2010 snap->size = rbd_dev->header.snap_sizes[i];
2011 snap->id = rbd_dev->header.snapc->snaps[i];
2012 if (device_is_registered(&rbd_dev->dev)) {
2013 ret = rbd_register_snap_dev(rbd_dev, snap,
2014 &rbd_dev->dev);
2015 if (ret < 0)
2016 goto err;
2017 }
2018 *snapp = snap;
2019 return 0;
2020err:
2021 kfree(snap->name);
2022 kfree(snap);
2023 return ret;
2024}
2025
/*
 * Walk backward to the previous name in a NUL-delimited string list.
 * "name" points just past a name's terminating NUL (or to a name);
 * "start" is the beginning of the list.  Returns a pointer to the
 * previous name, or NULL when there is no earlier entry.
 */
const char *rbd_prev_snap_name(const char *name, const char *start)
{
	const char *p;

	/* need at least one character plus a NUL before "name" */
	if (name < start + 2)
		return NULL;

	for (p = name - 2; *p; p--)
		if (p == start)
			return start;

	/* stopped on the previous name's terminating NUL */
	return p + 1;
}
2042
/*
 * compare the old list of snapshots that we have to what's in the header
 * and update it accordingly. Note that the header holds the snapshots
 * in a reverse order (from newest to oldest) and we need to go from
 * older to new so that we don't get a duplicate snap name when
 * doing the process (e.g., removed snapshot and recreated a new
 * one with the same name).
 *
 * Returns 0 on success or a negative errno.  Expects the header's
 * snapc/snap_names/snap_sizes arrays to be current on entry.
 */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
{
	const char *name, *first_name;
	int i = rbd_dev->header.total_snaps;	/* header snaps left to match */
	struct rbd_snap *snap, *old_snap = NULL;
	int ret;
	struct list_head *p, *n;

	first_name = rbd_dev->header.snap_names;
	/* start just past the NUL-delimited name blob; walked backward
	   via rbd_prev_snap_name() as we consume header entries */
	name = first_name + rbd_dev->header.snap_names_len;

	/* iterate the existing list oldest-first (reverse list order) */
	list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
		u64 cur_id;

		old_snap = list_entry(p, struct rbd_snap, node);

		/* cur_id is only read when i != 0 (short-circuit below) */
		if (i)
			cur_id = rbd_dev->header.snapc->snaps[i - 1];

		if (!i || old_snap->id < cur_id) {
			/* old_snap->id was skipped, thus was removed */
			__rbd_remove_snap_dev(rbd_dev, old_snap);
			continue;
		}
		if (old_snap->id == cur_id) {
			/* we have this snapshot already */
			i--;
			name = rbd_prev_snap_name(name, first_name);
			continue;
		}
		/* old_snap->id > cur_id: header has snaps we lack */
		for (; i > 0;
		     i--, name = rbd_prev_snap_name(name, first_name)) {
			if (!name) {
				WARN_ON(1);
				return -EINVAL;
			}
			/*
			 * NOTE(review): snaps[i] here (vs snaps[i - 1]
			 * everywhere else) looks like it could read one
			 * entry past the last matched slot on the first
			 * iteration -- confirm the intended indexing.
			 */
			cur_id = rbd_dev->header.snapc->snaps[i];
			/* snapshot removal? handle it above */
			if (cur_id >= old_snap->id)
				break;
			/* a new snapshot */
			ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
			if (ret < 0)
				return ret;

			/* note that we add it backward so using n and not p */
			list_add(&snap->node, n);
			p = &snap->node;
		}
	}
	/* we're done going over the old snap list, just add what's left */
	for (; i > 0; i--) {
		name = rbd_prev_snap_name(name, first_name);
		if (!name) {
			WARN_ON(1);
			return -EINVAL;
		}
		ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
		if (ret < 0)
			return ret;
		list_add(&snap->node, &rbd_dev->snaps);
	}

	return 0;
}
2116
/*
 * Register the rbd device, and a child device for each of its
 * current snapshots, on the rbd bus in sysfs.  Serialized against
 * other control operations by ctl_mutex.  Returns 0 on success or a
 * negative errno; on failure a partially registered device is left
 * for the caller to unwind via rbd_bus_del_dev().
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	int ret;
	struct device *dev;
	struct rbd_snap *snap;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	dev = &rbd_dev->dev;

	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->id);	/* /sys/bus/rbd/devices/<id> */
	ret = device_register(dev);
	if (ret < 0)
		goto out;

	/* register a sysfs child for each already-known snapshot */
	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		ret = rbd_register_snap_dev(rbd_dev, snap,
					     &rbd_dev->dev);
		if (ret < 0)
			break;
	}
out:
	mutex_unlock(&ctl_mutex);
	return ret;
}
2145
/*
 * Undo rbd_bus_add_dev(); the device release callback
 * (rbd_dev_release) performs the actual teardown and frees rbd_dev.
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2150
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002151static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2152{
2153 int ret, rc;
2154
2155 do {
2156 ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name,
2157 rbd_dev->header.obj_version);
2158 if (ret == -ERANGE) {
2159 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2160 rc = __rbd_update_snaps(rbd_dev);
2161 mutex_unlock(&ctl_mutex);
2162 if (rc < 0)
2163 return rc;
2164 }
2165 } while (ret == -ERANGE);
2166
2167 return ret;
2168}
2169
/* highest rbd device id handed out so far (ids start at 1) */
static atomic64_t rbd_id_max = ATOMIC64_INIT(0);
2171
/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list. The minimum rbd id is 1.
 */
static void rbd_id_get(struct rbd_device *rbd_dev)
{
	/* the atomic increment alone guarantees id uniqueness */
	rbd_dev->id = atomic64_inc_return(&rbd_id_max);

	/* rbd_dev_list is protected by rbd_dev_list_lock */
	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002184
Alex Elder1ddbe942012-01-29 13:57:44 -06002185/*
Alex Elder499afd52012-02-02 08:13:29 -06002186 * Remove an rbd_dev from the global list, and record that its
2187 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002188 */
Alex Elder499afd52012-02-02 08:13:29 -06002189static void rbd_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002190{
Alex Elderd184f6b2012-01-29 13:57:44 -06002191 struct list_head *tmp;
2192 int rbd_id = rbd_dev->id;
2193 int max_id;
2194
2195 BUG_ON(rbd_id < 1);
Alex Elder499afd52012-02-02 08:13:29 -06002196
2197 spin_lock(&rbd_dev_list_lock);
2198 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002199
2200 /*
2201 * If the id being "put" is not the current maximum, there
2202 * is nothing special we need to do.
2203 */
2204 if (rbd_id != atomic64_read(&rbd_id_max)) {
2205 spin_unlock(&rbd_dev_list_lock);
2206 return;
2207 }
2208
2209 /*
2210 * We need to update the current maximum id. Search the
2211 * list to find out what it is. We're more likely to find
2212 * the maximum at the end, so search the list backward.
2213 */
2214 max_id = 0;
2215 list_for_each_prev(tmp, &rbd_dev_list) {
2216 struct rbd_device *rbd_dev;
2217
2218 rbd_dev = list_entry(tmp, struct rbd_device, node);
2219 if (rbd_id > max_id)
2220 max_id = rbd_id;
2221 }
Alex Elder499afd52012-02-02 08:13:29 -06002222 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002223
Alex Elder1ddbe942012-01-29 13:57:44 -06002224 /*
Alex Elderd184f6b2012-01-29 13:57:44 -06002225 * The max id could have been updated by rbd_id_get(), in
2226 * which case it now accurately reflects the new maximum.
2227 * Be careful not to overwrite the maximum value in that
2228 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002229 */
Alex Elderd184f6b2012-01-29 13:57:44 -06002230 atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
Alex Elderb7f23c32012-01-29 13:57:43 -06002231}
2232
/*
 * Handle a write to /sys/bus/rbd/add: parse the monitor address,
 * options, pool, object and (optional) snapshot names, create and
 * register a new rbd device, and set up its block device and header
 * watch.  Returns count on success or a negative errno.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct ceph_osd_client *osdc;
	struct rbd_device *rbd_dev;
	ssize_t rc = -ENOMEM;
	int irc;
	char *mon_dev_name;
	char *options;

	/* pin the module; dropped in rbd_dev_release() or on error below */
	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
	if (!mon_dev_name)
		goto err_out_mod;

	options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
	if (!options)
		goto err_mon_dev;

	/* new rbd_device object */
	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		goto err_out_opt;

	/* static rbd_device initialization */
	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);

	init_rwsem(&rbd_dev->header.snap_rwsem);

	/* generate unique id: find highest unique id, add one */
	rbd_id_get(rbd_dev);

	/*
	 * parse add command; the snapshot name is optional, hence
	 * "< 4" rather than "< 5".
	 * NOTE(review): a full-width "%64s" conversion stores 64
	 * characters plus a NUL -- one byte more than a buffer of
	 * size RBD_MAX_POOL_NAME_LEN would hold; confirm the field
	 * sizes in struct rbd_device.
	 */
	if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s "
		   "%" __stringify(RBD_MAX_OPT_LEN) "s "
		   "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s "
		   "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s"
		   "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
		   mon_dev_name, options, rbd_dev->pool_name,
		   rbd_dev->obj, rbd_dev->snap_name) < 4) {
		rc = -EINVAL;
		goto err_put_id;
	}

	/* no snapshot given: map the image head */
	if (rbd_dev->snap_name[0] == 0)
		memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
			sizeof (RBD_SNAP_HEAD_NAME));

	rbd_dev->obj_len = strlen(rbd_dev->obj);
	snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s",
		 rbd_dev->obj, RBD_SUFFIX);

	/* initialize rest of new object */
	snprintf(rbd_dev->name, DEV_NAME_LEN, RBD_DRV_NAME "%d", rbd_dev->id);

	rc = rbd_get_client(rbd_dev, mon_dev_name, options);
	if (rc < 0)
		goto err_put_id;

	/* pick the pool */
	osdc = &rbd_dev->rbd_client->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
	if (rc < 0)
		goto err_out_client;
	rbd_dev->poolid = rc;

	/* register our block device */
	irc = register_blkdev(0, rbd_dev->name);
	if (irc < 0) {
		rc = irc;
		goto err_out_client;
	}
	rbd_dev->major = irc;

	rc = rbd_bus_add_dev(rbd_dev);
	if (rc)
		goto err_out_blkdev;

	/* set up and announce blkdev mapping */
	rc = rbd_init_disk(rbd_dev);
	if (rc)
		goto err_out_bus;

	rc = rbd_init_watch_dev(rbd_dev);
	if (rc)
		goto err_out_bus;

	return count;

err_out_bus:
	rbd_id_put(rbd_dev);

	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);	/* release callback frees rbd_dev */
	kfree(options);
	kfree(mon_dev_name);
	return rc;

err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_client:
	rbd_put_client(rbd_dev);
err_put_id:
	rbd_id_put(rbd_dev);
	kfree(rbd_dev);
err_out_opt:
	kfree(options);
err_mon_dev:
	kfree(mon_dev_name);
err_out_mod:
	dout("Error adding device %s\n", buf);
	module_put(THIS_MODULE);
	return rc;
}
2353
2354static struct rbd_device *__rbd_get_dev(unsigned long id)
2355{
2356 struct list_head *tmp;
2357 struct rbd_device *rbd_dev;
2358
Alex Eldere124a82f2012-01-29 13:57:44 -06002359 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002360 list_for_each(tmp, &rbd_dev_list) {
2361 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Eldere124a82f2012-01-29 13:57:44 -06002362 if (rbd_dev->id == id) {
2363 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002364 return rbd_dev;
Alex Eldere124a82f2012-01-29 13:57:44 -06002365 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002366 }
Alex Eldere124a82f2012-01-29 13:57:44 -06002367 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002368 return NULL;
2369}
2370
/*
 * Final teardown of an rbd device, called by the driver core once
 * the last reference to rbd_dev->dev is dropped (after
 * device_unregister()).  Stops the header watch, drops the ceph
 * client, frees the disk and major number, frees rbd_dev itself,
 * then releases the module reference taken in rbd_add().
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev =
			container_of(dev, struct rbd_device, dev);

	/* cancel the lingering watch request, if one was set up */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev, rbd_dev->obj_md_name);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
2395
/*
 * Handle a write to /sys/bus/rbd/remove: parse the target device id,
 * look the device up, and tear it down.  Returns count on success
 * (so sysfs sees the whole buffer consumed) or a negative errno.
 */
static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	int ret = count;

	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	/* serialize against other control operations */
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	/* release the id and unlink from the global list first */
	rbd_id_put(rbd_dev);

	__rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);	/* release callback frees rbd_dev */

done:
	mutex_unlock(&ctl_mutex);
	return ret;
}
2431
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002432static ssize_t rbd_snap_add(struct device *dev,
2433 struct device_attribute *attr,
2434 const char *buf,
2435 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002436{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002437 struct rbd_device *rbd_dev = dev_to_rbd(dev);
2438 int ret;
2439 char *name = kmalloc(count + 1, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002440 if (!name)
2441 return -ENOMEM;
2442
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002443 snprintf(name, count, "%s", buf);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002444
2445 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2446
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002447 ret = rbd_header_add_snap(rbd_dev,
2448 name, GFP_KERNEL);
2449 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002450 goto err_unlock;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002451
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002452 ret = __rbd_update_snaps(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002453 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002454 goto err_unlock;
2455
2456 /* shouldn't hold ctl_mutex when notifying.. notify might
2457 trigger a watch callback that would need to get that mutex */
2458 mutex_unlock(&ctl_mutex);
2459
2460 /* make a best effort, don't error if failed */
2461 rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002462
2463 ret = count;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002464 kfree(name);
2465 return ret;
2466
2467err_unlock:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002468 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002469 kfree(name);
2470 return ret;
2471}
2472
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002473/*
2474 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002475 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002476 */
2477static int rbd_sysfs_init(void)
2478{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002479 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002480
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002481 ret = bus_register(&rbd_bus_type);
Alex Elder21079782012-01-24 10:08:36 -06002482 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002483 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002484
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002485 ret = device_register(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002486
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002487 return ret;
2488}
2489
/* Tear down what rbd_sysfs_init() created, in reverse order. */
static void rbd_sysfs_cleanup(void)
{
	device_unregister(&rbd_root_dev);
	bus_unregister(&rbd_bus_type);
}
2495
2496int __init rbd_init(void)
2497{
2498 int rc;
2499
2500 rc = rbd_sysfs_init();
2501 if (rc)
2502 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002503 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002504 return 0;
2505}
2506
/* Module unload: remove the sysfs bus and root device. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2511
/* module entry/exit points and metadata */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");