blob: de1520ccc0d46afdf5bbfefffa33e83913ce694c [file] [log] [blame]
Alex Eldere2a58ee2013-04-30 00:44:33 -05001
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002/*
3 rbd.c -- Export ceph rados objects as a Linux block device
4
5
6 based on drivers/block/osdblk.c:
7
8 Copyright 2009 Red Hat, Inc.
9
10 This program is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; see the file COPYING. If not, write to
21 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22
23
24
Yehuda Sadehdfc56062010-11-19 14:51:04 -080025 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070026
Yehuda Sadehdfc56062010-11-19 14:51:04 -080027 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070028
29 */
30
31#include <linux/ceph/libceph.h>
32#include <linux/ceph/osd_client.h>
33#include <linux/ceph/mon_client.h>
34#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070035#include <linux/parser.h>
Alex Elder30d1cff2013-05-01 12:43:03 -050036#include <linux/bsearch.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070037
38#include <linux/kernel.h>
39#include <linux/device.h>
40#include <linux/module.h>
41#include <linux/fs.h>
42#include <linux/blkdev.h>
Alex Elder1c2a9df2013-05-01 12:43:03 -050043#include <linux/slab.h>
Ilya Dryomovf8a22fc2013-12-13 15:28:57 +020044#include <linux/idr.h>
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +040045#include <linux/workqueue.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070046
47#include "rbd_types.h"
48
Alex Elderaafb2302012-09-06 16:00:54 -050049#define RBD_DEBUG /* Activate rbd_assert() calls */
50
Alex Elder593a9e72012-02-07 12:03:37 -060051/*
52 * The basic unit of block I/O is a sector. It is interpreted in a
53 * number of contexts in Linux (blk, bio, genhd), but the default is
54 * universally 512 bytes. These symbols are just slightly more
55 * meaningful than the bare numbers they represent.
56 */
57#define SECTOR_SHIFT 9
58#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
59
Alex Eldera2acd002013-05-08 22:50:04 -050060/*
61 * Increment the given counter and return its updated value.
62 * If the counter is already 0 it will not be incremented.
63 * If the counter is already at its maximum value returns
64 * -EINVAL without updating it.
65 */
66static int atomic_inc_return_safe(atomic_t *v)
67{
68 unsigned int counter;
69
70 counter = (unsigned int)__atomic_add_unless(v, 1, 0);
71 if (counter <= (unsigned int)INT_MAX)
72 return (int)counter;
73
74 atomic_dec(v);
75
76 return -EINVAL;
77}
78
79/* Decrement the counter. Return the resulting value, or -EINVAL */
80static int atomic_dec_return_safe(atomic_t *v)
81{
82 int counter;
83
84 counter = atomic_dec_return(v);
85 if (counter >= 0)
86 return counter;
87
88 atomic_inc(v);
89
90 return -EINVAL;
91}
92
Alex Elderf0f8cef2012-01-29 13:57:44 -060093#define RBD_DRV_NAME "rbd"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070094
Ilya Dryomov7e513d42013-12-16 19:26:32 +020095#define RBD_MINORS_PER_MAJOR 256
96#define RBD_SINGLE_MAJOR_PART_SHIFT 4
Yehuda Sadeh602adf42010-08-12 16:11:25 -070097
Alex Elderd4b125e2012-07-03 16:01:19 -050098#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
99#define RBD_MAX_SNAP_NAME_LEN \
100 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
101
Alex Elder35d489f2012-07-03 16:01:19 -0500102#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700103
104#define RBD_SNAP_HEAD_NAME "-"
105
Alex Elder9682fc62013-04-30 00:44:33 -0500106#define BAD_SNAP_INDEX U32_MAX /* invalid index into snap array */
107
Alex Elder9e15b772012-10-30 19:40:33 -0500108/* This allows a single page to hold an image name sent by OSD */
109#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1)
Alex Elder1e130192012-07-03 16:01:19 -0500110#define RBD_IMAGE_ID_LEN_MAX 64
Alex Elder9e15b772012-10-30 19:40:33 -0500111
Alex Elder1e130192012-07-03 16:01:19 -0500112#define RBD_OBJ_PREFIX_LEN_MAX 64
Alex Elder589d30e2012-07-10 20:30:11 -0500113
Alex Elderd8891402012-10-09 13:50:17 -0700114/* Feature bits */
115
Alex Elder5cbf6f122013-04-11 09:29:48 -0500116#define RBD_FEATURE_LAYERING (1<<0)
117#define RBD_FEATURE_STRIPINGV2 (1<<1)
118#define RBD_FEATURES_ALL \
119 (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)
Alex Elderd8891402012-10-09 13:50:17 -0700120
121/* Features supported by this (client software) implementation. */
122
Alex Elder770eba62012-10-25 23:34:40 -0500123#define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL)
Alex Elderd8891402012-10-09 13:50:17 -0700124
Alex Elder81a89792012-02-02 08:13:30 -0600125/*
126 * An RBD device name will be "rbd#", where the "rbd" comes from
127 * RBD_DRV_NAME above, and # is a unique integer identifier.
128 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
129 * enough to hold all possible device names.
130 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700131#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -0600132#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700133
134/*
135 * block device image metadata (in-memory version)
136 */
struct rbd_image_header {
	/* These six fields never change for a given rbd image */
	char *object_prefix;
	__u8 obj_order;
	__u8 crypt_type;
	__u8 comp_type;
	u64 stripe_unit;
	u64 stripe_count;
	u64 features;		/* Might be changeable someday? */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* format 1 only */
	u64 *snap_sizes;	/* format 1 only */
	/* NOTE(review): on-disk layout of the header lives in rbd_types.h */
};
153
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500154/*
155 * An rbd image specification.
156 *
157 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
Alex Elderc66c6e02012-11-01 08:39:26 -0500158 * identify an image. Each rbd_dev structure includes a pointer to
159 * an rbd_spec structure that encapsulates this identity.
160 *
161 * Each of the id's in an rbd_spec has an associated name. For a
162 * user-mapped image, the names are supplied and the id's associated
163 * with them are looked up. For a layered image, a parent image is
164 * defined by the tuple, and the names are looked up.
165 *
166 * An rbd_dev structure contains a parent_spec pointer which is
167 * non-null if the image it represents is a child in a layered
168 * image. This pointer will refer to the rbd_spec structure used
169 * by the parent rbd_dev for its own identity (i.e., the structure
170 * is shared between the parent and child).
171 *
172 * Since these structures are populated once, during the discovery
173 * phase of image construction, they are effectively immutable so
174 * we make no effort to synchronize access to them.
175 *
176 * Note that code herein does not assume the image name is known (it
177 * could be a null pointer).
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500178 */
struct rbd_spec {
	/* The (pool_id, image_id, snap_id) tuple uniquely identifies an
	 * image; see the block comment above this structure. */
	u64		pool_id;
	const char	*pool_name;

	const char	*image_id;
	const char	*image_name;	/* may be NULL (name not known) */

	u64		snap_id;
	const char	*snap_name;

	struct kref	kref;		/* shared between parent and child */
};
191
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700192/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600193 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700194 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;	/* link in rbd_client_list */
};
200
Alex Elderbf0d5f502012-11-22 00:00:08 -0600201struct rbd_img_request;
202typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
203
204#define BAD_WHICH U32_MAX /* Good which or bad which, which? */
205
206struct rbd_obj_request;
207typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
208
Alex Elder9969ebc2013-01-18 12:31:10 -0600209enum obj_request_type {
210 OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
211};
Alex Elderbf0d5f502012-11-22 00:00:08 -0600212
Guangliang Zhao6d2940c2014-03-13 11:21:35 +0800213enum obj_operation_type {
214 OBJ_OP_WRITE,
215 OBJ_OP_READ,
Guangliang Zhao90e98c52014-04-01 22:22:16 +0800216 OBJ_OP_DISCARD,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +0800217};
218
Alex Elder926f9b32013-02-11 12:33:24 -0600219enum obj_req_flags {
220 OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */
Alex Elder6365d332013-02-11 12:33:24 -0600221 OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */
Alex Elder5679c592013-02-11 12:33:24 -0600222 OBJ_REQ_KNOWN, /* EXISTS flag valid: no = 0, yes = 1 */
223 OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */
Alex Elder926f9b32013-02-11 12:33:24 -0600224};
225
Alex Elderbf0d5f502012-11-22 00:00:08 -0600226struct rbd_obj_request {
227 const char *object_name;
228 u64 offset; /* object start byte */
229 u64 length; /* bytes from offset */
Alex Elder926f9b32013-02-11 12:33:24 -0600230 unsigned long flags;
Alex Elderbf0d5f502012-11-22 00:00:08 -0600231
Alex Elderc5b5ef62013-02-11 12:33:24 -0600232 /*
233 * An object request associated with an image will have its
234 * img_data flag set; a standalone object request will not.
235 *
236 * A standalone object request will have which == BAD_WHICH
237 * and a null obj_request pointer.
238 *
239 * An object request initiated in support of a layered image
240 * object (to check for its existence before a write) will
241 * have which == BAD_WHICH and a non-null obj_request pointer.
242 *
243 * Finally, an object request for rbd image data will have
244 * which != BAD_WHICH, and will have a non-null img_request
245 * pointer. The value of which will be in the range
246 * 0..(img_request->obj_request_count-1).
247 */
248 union {
249 struct rbd_obj_request *obj_request; /* STAT op */
250 struct {
251 struct rbd_img_request *img_request;
252 u64 img_offset;
253 /* links for img_request->obj_requests list */
254 struct list_head links;
255 };
256 };
Alex Elderbf0d5f502012-11-22 00:00:08 -0600257 u32 which; /* posn image request list */
258
259 enum obj_request_type type;
Alex Elder788e2df2013-01-17 12:25:27 -0600260 union {
261 struct bio *bio_list;
262 struct {
263 struct page **pages;
264 u32 page_count;
265 };
266 };
Alex Elder0eefd472013-04-19 15:34:50 -0500267 struct page **copyup_pages;
Alex Elderebda6402013-05-10 16:29:22 -0500268 u32 copyup_page_count;
Alex Elderbf0d5f502012-11-22 00:00:08 -0600269
270 struct ceph_osd_request *osd_req;
271
272 u64 xferred; /* bytes transferred */
Sage Weil1b83bef2013-02-25 16:11:12 -0800273 int result;
Alex Elderbf0d5f502012-11-22 00:00:08 -0600274
275 rbd_obj_callback_t callback;
Alex Elder788e2df2013-01-17 12:25:27 -0600276 struct completion completion;
Alex Elderbf0d5f502012-11-22 00:00:08 -0600277
278 struct kref kref;
279};
280
Alex Elder0c425242013-02-08 09:55:49 -0600281enum img_req_flags {
Alex Elder9849e982013-01-24 16:13:36 -0600282 IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */
283 IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */
Alex Elderd0b2e942013-01-24 16:13:36 -0600284 IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */
Guangliang Zhao90e98c52014-04-01 22:22:16 +0800285 IMG_REQ_DISCARD, /* discard: normal = 0, discard request = 1 */
Alex Elder0c425242013-02-08 09:55:49 -0600286};
287
Alex Elderbf0d5f502012-11-22 00:00:08 -0600288struct rbd_img_request {
Alex Elderbf0d5f502012-11-22 00:00:08 -0600289 struct rbd_device *rbd_dev;
290 u64 offset; /* starting image byte offset */
291 u64 length; /* byte count from offset */
Alex Elder0c425242013-02-08 09:55:49 -0600292 unsigned long flags;
Alex Elderbf0d5f502012-11-22 00:00:08 -0600293 union {
Alex Elder9849e982013-01-24 16:13:36 -0600294 u64 snap_id; /* for reads */
Alex Elderbf0d5f502012-11-22 00:00:08 -0600295 struct ceph_snap_context *snapc; /* for writes */
Alex Elder9849e982013-01-24 16:13:36 -0600296 };
297 union {
298 struct request *rq; /* block request */
299 struct rbd_obj_request *obj_request; /* obj req initiator */
Alex Elderbf0d5f502012-11-22 00:00:08 -0600300 };
Alex Elder3d7efd12013-04-19 15:34:50 -0500301 struct page **copyup_pages;
Alex Elderebda6402013-05-10 16:29:22 -0500302 u32 copyup_page_count;
Alex Elderbf0d5f502012-11-22 00:00:08 -0600303 spinlock_t completion_lock;/* protects next_completion */
304 u32 next_completion;
305 rbd_img_callback_t callback;
Alex Elder55f27e02013-04-10 12:34:25 -0500306 u64 xferred;/* aggregate bytes transferred */
Alex Eldera5a337d2013-01-24 16:13:36 -0600307 int result; /* first nonzero obj_request result */
Alex Elderbf0d5f502012-11-22 00:00:08 -0600308
309 u32 obj_request_count;
310 struct list_head obj_requests; /* rbd_obj_request structs */
311
312 struct kref kref;
313};
314
/*
 * Iterators over an image request's object requests.  The _from variant
 * continues from a given position; the _safe variant walks in reverse
 * and tolerates removal of the current entry.
 */
#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
Alex Elderbf0d5f502012-11-22 00:00:08 -0600321
Alex Elderf84344f2012-08-31 17:29:51 -0500322struct rbd_mapping {
Alex Elder99c1f082012-08-30 14:42:15 -0500323 u64 size;
Alex Elder34b13182012-07-13 20:35:12 -0500324 u64 features;
Alex Elderf84344f2012-08-31 17:29:51 -0500325 bool read_only;
326};
327
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700328/*
329 * a single device
330 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	int			minor;		/* blkdev assigned minor */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;	/* shared ceph client */

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	struct list_head	rq_queue;	/* incoming rq queue */
	spinlock_t		lock;		/* queue, flags, open_count */
	struct workqueue_struct	*rq_wq;		/* services rq_queue */
	struct work_struct	rq_work;

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;		/* identity of this image */

	char			*header_name;

	struct ceph_file_layout	layout;

	struct ceph_osd_event   *watch_event;
	struct rbd_obj_request	*watch_request;

	/* Layering: set only when this image is a clone of a parent. */
	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	atomic_t		parent_ref;
	struct rbd_device	*parent;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;		/* link in rbd_dev_list */

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};
375
Alex Elderb82d1672013-01-14 12:43:31 -0600376/*
377 * Flag bits for rbd_dev->flags. If atomicity is required,
378 * rbd_dev->lock is used to protect access.
379 *
380 * Currently, only the "removing" flag (which is coupled with the
381 * "open_count" field) requires atomic access.
382 */
Alex Elder6d292902013-01-14 12:43:31 -0600383enum rbd_dev_flags {
384 RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */
Alex Elderb82d1672013-01-14 12:43:31 -0600385 RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */
Alex Elder6d292902013-01-14 12:43:31 -0600386};
387
Alex Eldercfbf6372013-05-31 17:40:45 -0500388static DEFINE_MUTEX(client_mutex); /* Serialize client creation */
Alex Eldere124a82f2012-01-29 13:57:44 -0600389
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700390static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a82f2012-01-29 13:57:44 -0600391static DEFINE_SPINLOCK(rbd_dev_list_lock);
392
Alex Elder432b8582012-01-29 13:57:44 -0600393static LIST_HEAD(rbd_client_list); /* clients */
394static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700395
Alex Elder78c2a442013-05-01 12:43:04 -0500396/* Slab caches for frequently-allocated structures */
397
Alex Elder1c2a9df2013-05-01 12:43:03 -0500398static struct kmem_cache *rbd_img_request_cache;
Alex Elder868311b2013-05-01 12:43:03 -0500399static struct kmem_cache *rbd_obj_request_cache;
Alex Elder78c2a442013-05-01 12:43:04 -0500400static struct kmem_cache *rbd_segment_name_cache;
Alex Elder1c2a9df2013-05-01 12:43:03 -0500401
Ilya Dryomov9b60e702013-12-13 15:28:57 +0200402static int rbd_major;
Ilya Dryomovf8a22fc2013-12-13 15:28:57 +0200403static DEFINE_IDA(rbd_dev_id_ida);
404
Ilya Dryomov9b60e702013-12-13 15:28:57 +0200405/*
406 * Default to false for now, as single-major requires >= 0.75 version of
407 * userspace rbd utility.
408 */
409static bool single_major = false;
410module_param(single_major, bool, S_IRUGO);
411MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)");
412
Alex Elder3d7efd12013-04-19 15:34:50 -0500413static int rbd_img_request_submit(struct rbd_img_request *img_request);
414
Alex Elder200a6a82013-04-28 23:32:34 -0500415static void rbd_dev_device_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800416
Alex Elderf0f8cef2012-01-29 13:57:44 -0600417static ssize_t rbd_add(struct bus_type *bus, const char *buf,
418 size_t count);
419static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
420 size_t count);
Ilya Dryomov9b60e702013-12-13 15:28:57 +0200421static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
422 size_t count);
423static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
424 size_t count);
Alex Elder1f3ef782013-05-06 17:40:33 -0500425static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping);
Alex Eldera2acd002013-05-08 22:50:04 -0500426static void rbd_spec_put(struct rbd_spec *spec);
Alex Elderf0f8cef2012-01-29 13:57:44 -0600427
Ilya Dryomov9b60e702013-12-13 15:28:57 +0200428static int rbd_dev_id_to_minor(int dev_id)
429{
Ilya Dryomov7e513d42013-12-16 19:26:32 +0200430 return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
Ilya Dryomov9b60e702013-12-13 15:28:57 +0200431}
432
/* Inverse of rbd_dev_id_to_minor(); partition bits are discarded. */
static int minor_to_rbd_dev_id(int minor)
{
	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}
437
Greg Kroah-Hartmanb15a21d2013-08-23 14:24:28 -0700438static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
439static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
Ilya Dryomov9b60e702013-12-13 15:28:57 +0200440static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
441static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
Greg Kroah-Hartmanb15a21d2013-08-23 14:24:28 -0700442
/* Bus-level sysfs attributes; visibility is filtered by rbd_bus_is_visible. */
static struct attribute *rbd_bus_attrs[] = {
	&bus_attr_add.attr,
	&bus_attr_remove.attr,
	&bus_attr_add_single_major.attr,
	&bus_attr_remove_single_major.attr,
	NULL,
};
Ilya Dryomov92c76dc2013-12-13 15:28:57 +0200450
451static umode_t rbd_bus_is_visible(struct kobject *kobj,
452 struct attribute *attr, int index)
453{
Ilya Dryomov9b60e702013-12-13 15:28:57 +0200454 if (!single_major &&
455 (attr == &bus_attr_add_single_major.attr ||
456 attr == &bus_attr_remove_single_major.attr))
457 return 0;
458
Ilya Dryomov92c76dc2013-12-13 15:28:57 +0200459 return attr->mode;
460}
461
static const struct attribute_group rbd_bus_group = {
	.attrs = rbd_bus_attrs,
	.is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);	/* defines rbd_bus_groups */

static struct bus_type rbd_bus_type = {
	.name = "rbd",
	.bus_groups = rbd_bus_groups,
};
472
/* The root device is statically allocated, so there is nothing to free. */
static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name = "rbd",
	.release = rbd_root_dev_release,
};
481
Alex Elder06ecc6c2012-11-01 10:17:15 -0500482static __printf(2, 3)
483void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
484{
485 struct va_format vaf;
486 va_list args;
487
488 va_start(args, fmt);
489 vaf.fmt = fmt;
490 vaf.va = &args;
491
492 if (!rbd_dev)
493 printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
494 else if (rbd_dev->disk)
495 printk(KERN_WARNING "%s: %s: %pV\n",
496 RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
497 else if (rbd_dev->spec && rbd_dev->spec->image_name)
498 printk(KERN_WARNING "%s: image %s: %pV\n",
499 RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
500 else if (rbd_dev->spec && rbd_dev->spec->image_id)
501 printk(KERN_WARNING "%s: id %s: %pV\n",
502 RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
503 else /* punt */
504 printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
505 RBD_DRV_NAME, rbd_dev, &vaf);
506 va_end(args);
507}
508
Alex Elderaafb2302012-09-06 16:00:54 -0500509#ifdef RBD_DEBUG
510#define rbd_assert(expr) \
511 if (unlikely(!(expr))) { \
512 printk(KERN_ERR "\nAssertion failure in %s() " \
513 "at line %d:\n\n" \
514 "\trbd_assert(%s);\n\n", \
515 __func__, __LINE__, #expr); \
516 BUG(); \
517 }
518#else /* !RBD_DEBUG */
519# define rbd_assert(expr) ((void) 0)
520#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800521
Alex Elderb454e362013-04-19 15:34:50 -0500522static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
Alex Elder05a46af2013-04-26 15:44:36 -0500523static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
524static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
Alex Elder8b3e1a52013-01-24 16:13:36 -0600525
Alex Eldercc4a38bd2013-04-30 00:44:33 -0500526static int rbd_dev_refresh(struct rbd_device *rbd_dev);
Alex Elder2df3fac2013-05-06 09:51:30 -0500527static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
Ilya Dryomova720ae02014-07-23 17:11:19 +0400528static int rbd_dev_header_info(struct rbd_device *rbd_dev);
Ilya Dryomove8f59b52014-07-24 10:42:13 +0400529static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
Alex Elder54cac612013-04-30 00:44:33 -0500530static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
531 u64 snap_id);
Alex Elder2ad3d712013-04-30 00:44:33 -0500532static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
533 u8 *order, u64 *snap_size);
534static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
535 u64 *snap_features);
536static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700537
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700538static int rbd_open(struct block_device *bdev, fmode_t mode)
539{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600540 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Alex Elderb82d1672013-01-14 12:43:31 -0600541 bool removing = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700542
Alex Elderf84344f2012-08-31 17:29:51 -0500543 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700544 return -EROFS;
545
Alex Eldera14ea262013-02-05 13:23:12 -0600546 spin_lock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -0600547 if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
548 removing = true;
549 else
550 rbd_dev->open_count++;
Alex Eldera14ea262013-02-05 13:23:12 -0600551 spin_unlock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -0600552 if (removing)
553 return -ENOENT;
554
Alex Elderc3e946c2012-11-16 09:29:16 -0600555 (void) get_device(&rbd_dev->dev);
Alex Elder340c7a22012-08-10 13:12:07 -0700556
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700557 return 0;
558}
559
Al Virodb2a1442013-05-05 21:52:57 -0400560static void rbd_release(struct gendisk *disk, fmode_t mode)
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800561{
562 struct rbd_device *rbd_dev = disk->private_data;
Alex Elderb82d1672013-01-14 12:43:31 -0600563 unsigned long open_count_before;
564
Alex Eldera14ea262013-02-05 13:23:12 -0600565 spin_lock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -0600566 open_count_before = rbd_dev->open_count--;
Alex Eldera14ea262013-02-05 13:23:12 -0600567 spin_unlock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -0600568 rbd_assert(open_count_before > 0);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800569
Alex Elderc3e946c2012-11-16 09:29:16 -0600570 put_device(&rbd_dev->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800571}
572
Guangliang Zhao131fd9f2013-09-24 11:25:36 +0800573static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
574{
Josh Durgin77f33c02013-09-30 17:09:54 -0700575 int ret = 0;
Guangliang Zhao131fd9f2013-09-24 11:25:36 +0800576 int val;
577 bool ro;
Josh Durgin77f33c02013-09-30 17:09:54 -0700578 bool ro_changed = false;
Guangliang Zhao131fd9f2013-09-24 11:25:36 +0800579
Josh Durgin77f33c02013-09-30 17:09:54 -0700580 /* get_user() may sleep, so call it before taking rbd_dev->lock */
Guangliang Zhao131fd9f2013-09-24 11:25:36 +0800581 if (get_user(val, (int __user *)(arg)))
582 return -EFAULT;
583
584 ro = val ? true : false;
585 /* Snapshot doesn't allow to write*/
586 if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
587 return -EROFS;
588
Josh Durgin77f33c02013-09-30 17:09:54 -0700589 spin_lock_irq(&rbd_dev->lock);
590 /* prevent others open this device */
591 if (rbd_dev->open_count > 1) {
592 ret = -EBUSY;
593 goto out;
Guangliang Zhao131fd9f2013-09-24 11:25:36 +0800594 }
595
Josh Durgin77f33c02013-09-30 17:09:54 -0700596 if (rbd_dev->mapping.read_only != ro) {
597 rbd_dev->mapping.read_only = ro;
598 ro_changed = true;
599 }
600
601out:
602 spin_unlock_irq(&rbd_dev->lock);
603 /* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */
604 if (ret == 0 && ro_changed)
605 set_disk_ro(rbd_dev->disk, ro ? 1 : 0);
606
607 return ret;
Guangliang Zhao131fd9f2013-09-24 11:25:36 +0800608}
609
610static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
611 unsigned int cmd, unsigned long arg)
612{
613 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
614 int ret = 0;
615
Guangliang Zhao131fd9f2013-09-24 11:25:36 +0800616 switch (cmd) {
617 case BLKROSET:
618 ret = rbd_ioctl_set_ro(rbd_dev, arg);
619 break;
620 default:
621 ret = -ENOTTY;
622 }
623
Guangliang Zhao131fd9f2013-09-24 11:25:36 +0800624 return ret;
625}
626
#ifdef CONFIG_COMPAT
/* 32-bit compat entry point; simply forwards to the native handler. */
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	return rbd_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */
634
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700635static const struct block_device_operations rbd_bd_ops = {
636 .owner = THIS_MODULE,
637 .open = rbd_open,
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800638 .release = rbd_release,
Guangliang Zhao131fd9f2013-09-24 11:25:36 +0800639 .ioctl = rbd_ioctl,
640#ifdef CONFIG_COMPAT
641 .compat_ioctl = rbd_compat_ioctl,
642#endif
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700643};
644
645/*
Alex Elder7262cfc2013-05-16 15:04:20 -0500646 * Initialize an rbd client instance. Success or not, this function
Alex Eldercfbf6372013-05-31 17:40:45 -0500647 * consumes ceph_opts. Caller holds client_mutex.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700648 */
Alex Elderf8c38922012-08-10 13:12:07 -0700649static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700650{
651 struct rbd_client *rbdc;
652 int ret = -ENOMEM;
653
Alex Elder37206ee2013-02-20 17:32:08 -0600654 dout("%s:\n", __func__);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700655 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
656 if (!rbdc)
657 goto out_opt;
658
659 kref_init(&rbdc->kref);
660 INIT_LIST_HEAD(&rbdc->node);
661
Alex Elder43ae4702012-07-03 16:01:18 -0500662 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700663 if (IS_ERR(rbdc->client))
Alex Elder08f75462013-05-29 11:19:00 -0500664 goto out_rbdc;
Alex Elder43ae4702012-07-03 16:01:18 -0500665 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700666
667 ret = ceph_open_session(rbdc->client);
668 if (ret < 0)
Alex Elder08f75462013-05-29 11:19:00 -0500669 goto out_client;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700670
Alex Elder432b8582012-01-29 13:57:44 -0600671 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700672 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600673 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700674
Alex Elder37206ee2013-02-20 17:32:08 -0600675 dout("%s: rbdc %p\n", __func__, rbdc);
Alex Elderbc534d82012-01-29 13:57:44 -0600676
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700677 return rbdc;
Alex Elder08f75462013-05-29 11:19:00 -0500678out_client:
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700679 ceph_destroy_client(rbdc->client);
Alex Elder08f75462013-05-29 11:19:00 -0500680out_rbdc:
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700681 kfree(rbdc);
682out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500683 if (ceph_opts)
684 ceph_destroy_options(ceph_opts);
Alex Elder37206ee2013-02-20 17:32:08 -0600685 dout("%s: error %d\n", __func__, ret);
686
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400687 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700688}
689
Alex Elder2f82ee52012-10-30 19:40:33 -0500690static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
691{
692 kref_get(&rbdc->kref);
693
694 return rbdc;
695}
696
/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count (the caller must drop it with
 * rbd_put_client()).  Returns NULL if no matching client exists or
 * if the options forbid sharing.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	/* A mapping that asked not to share never matches an existing client */
	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			/* Grab the reference while still under the lock */
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}
722
/*
 * mount options
 *
 * Token values below Opt_last_int take an integer argument, values
 * between Opt_last_int and Opt_last_string take a string argument,
 * and values between Opt_last_string and Opt_last_bool are Boolean
 * flags taking no argument (see parse_rbd_opts_token()).
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};
736
/* Token table for match_token(); maps option strings to Opt_* values */
static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};
747
/* Per-mapping options parsed from the rbd "add" command line */
struct rbd_options {
	bool	read_only;	/* true if the mapping is read-only */
};

#define RBD_READ_ONLY_DEFAULT	false
753
/*
 * Parse one rbd mount option token.
 *
 * @c:		the option string (e.g. "read_only")
 * @private:	opaque pointer to the struct rbd_options being filled in
 *
 * Returns 0 on success, -EINVAL for an unrecognized token, or the
 * error returned by match_int() for a malformed integer argument.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/*
	 * The token's position relative to the Opt_last_* markers in
	 * the enum determines what kind of argument it carries.
	 */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* match_token() succeeded above, so this can't happen */
		rbd_assert(false);
		break;
	}
	return 0;
}
794
Guangliang Zhao6d2940c2014-03-13 11:21:35 +0800795static char* obj_op_name(enum obj_operation_type op_type)
796{
797 switch (op_type) {
798 case OBJ_OP_READ:
799 return "read";
800 case OBJ_OP_WRITE:
801 return "write";
Guangliang Zhao90e98c52014-04-01 22:22:16 +0800802 case OBJ_OP_DISCARD:
803 return "discard";
Guangliang Zhao6d2940c2014-03-13 11:21:35 +0800804 default:
805 return "???";
806 }
807}
808
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, ceph_opts is consumed by this
 * function (either handed to the new client or destroyed here).
 *
 * Returns the (possibly shared) client, or an ERR_PTR from
 * rbd_client_create() on failure.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;

	/* Serialize concurrent lookup-or-create of shared clients */
	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
	rbdc = rbd_client_find(ceph_opts);
	if (rbdc)	/* using an existing client */
		ceph_destroy_options(ceph_opts);
	else
		rbdc = rbd_client_create(ceph_opts);
	mutex_unlock(&client_mutex);

	return rbdc;
}
828
/*
 * Destroy ceph client
 *
 * Invoked via kref_put() when the last reference is dropped.  Takes
 * rbd_client_list_lock itself to unlink the client, so the caller
 * must NOT already hold it.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
846
/*
 * Drop reference to ceph client node.  If it's not referenced anymore, release
 * it.  A NULL rbdc is allowed and is a no-op.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}
856
Alex Eldera30b71b2012-07-10 20:30:11 -0500857static bool rbd_image_format_valid(u32 image_format)
858{
859 return image_format == 1 || image_format == 2;
860}
861
/*
 * Sanity-check a format 1 on-disk image header before any of its
 * fields are trusted.  Returns false if the header cannot be valid.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire the snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
900
/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 *
 * On the first call for a device (object_prefix still NULL) all
 * fields are populated; on refresh only the snapshot context, names,
 * sizes and image size are replaced.  The ondisk header is assumed to
 * have passed rbd_dev_ondisk_valid().
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or -EIO if
 * the snapshot names length is impossibly large.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
				struct rbd_image_header_ondisk *ondisk)
{
	struct rbd_image_header *header = &rbd_dev->header;
	bool first_time = header->object_prefix == NULL;
	struct ceph_snap_context *snapc;
	char *object_prefix = NULL;
	char *snap_names = NULL;
	u64 *snap_sizes = NULL;
	u32 snap_count;
	size_t size;
	int ret = -ENOMEM;
	u32 i;

	/* Allocate this now to avoid having to handle failure below */

	if (first_time) {
		size_t len;

		/* On-disk prefix may not be NUL-terminated; bound it */
		len = strnlen(ondisk->object_prefix,
				sizeof (ondisk->object_prefix));
		object_prefix = kmalloc(len + 1, GFP_KERNEL);
		if (!object_prefix)
			return -ENOMEM;
		memcpy(object_prefix, ondisk->object_prefix, len);
		object_prefix[len] = '\0';
	}

	/* Allocate the snapshot context and fill it in */

	snap_count = le32_to_cpu(ondisk->snap_count);
	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc)
		goto out_err;
	snapc->seq = le64_to_cpu(ondisk->snap_seq);
	if (snap_count) {
		struct rbd_image_snap_ondisk *snaps;
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* We'll keep a copy of the snapshot names... */

		if (snap_names_len > (u64)SIZE_MAX)
			goto out_2big;
		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!snap_names)
			goto out_err;

		/* ...as well as the array of their sizes. */

		size = snap_count * sizeof (*header->snap_sizes);
		snap_sizes = kmalloc(size, GFP_KERNEL);
		if (!snap_sizes)
			goto out_err;

		/*
		 * Copy the names, and fill in each snapshot's id
		 * and size.
		 *
		 * Note that rbd_dev_v1_header_info() guarantees the
		 * ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, this memcpy() is safe.
		 */
		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
		snaps = ondisk->snaps;
		for (i = 0; i < snap_count; i++) {
			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
		}
	}

	/* We won't fail any more, fill in the header */

	if (first_time) {
		header->object_prefix = object_prefix;
		header->obj_order = ondisk->options.order;
		header->crypt_type = ondisk->options.crypt_type;
		header->comp_type = ondisk->options.comp_type;
		/* The rest aren't used for format 1 images */
		header->stripe_unit = 0;
		header->stripe_count = 0;
		header->features = 0;
	} else {
		/* Refresh: drop the old snapshot data being replaced */
		ceph_put_snap_context(header->snapc);
		kfree(header->snap_names);
		kfree(header->snap_sizes);
	}

	/* The remaining fields always get updated (when we refresh) */

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->snapc = snapc;
	header->snap_names = snap_names;
	header->snap_sizes = snap_sizes;

	return 0;
out_2big:
	ret = -EIO;
out_err:
	kfree(snap_sizes);
	kfree(snap_names);
	ceph_put_snap_context(snapc);
	kfree(object_prefix);

	return ret;
}
1011
Alex Elder9682fc62013-04-30 00:44:33 -05001012static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
1013{
1014 const char *snap_name;
1015
1016 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
1017
1018 /* Skip over names until we find the one we are looking for */
1019
1020 snap_name = rbd_dev->header.snap_names;
1021 while (which--)
1022 snap_name += strlen(snap_name) + 1;
1023
1024 return kstrdup(snap_name, GFP_KERNEL);
1025}
1026
Alex Elder30d1cff2013-05-01 12:43:03 -05001027/*
1028 * Snapshot id comparison function for use with qsort()/bsearch().
1029 * Note that result is for snapshots in *descending* order.
1030 */
1031static int snapid_compare_reverse(const void *s1, const void *s2)
1032{
1033 u64 snap_id1 = *(u64 *)s1;
1034 u64 snap_id2 = *(u64 *)s2;
1035
1036 if (snap_id1 < snap_id2)
1037 return 1;
1038 return snap_id1 == snap_id2 ? 0 : -1;
1039}
1040
/*
 * Search a snapshot context to see if the given snapshot id is
 * present.
 *
 * Returns the position of the snapshot id in the array if it's found,
 * or BAD_SNAP_INDEX otherwise.
 *
 * Note: The snapshot array is in kept sorted (by the osd) in
 * reverse order, highest snapshot id first.
 */
static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	u64 *found;

	/* Binary search using the descending-order comparator */
	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
				sizeof (snap_id), snapid_compare_reverse);

	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
}
1061
Alex Elder2ad3d712013-04-30 00:44:33 -05001062static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
1063 u64 snap_id)
Alex Elder54cac612013-04-30 00:44:33 -05001064{
1065 u32 which;
Josh Durginda6a6b62013-09-04 17:57:31 -07001066 const char *snap_name;
Alex Elder54cac612013-04-30 00:44:33 -05001067
1068 which = rbd_dev_snap_index(rbd_dev, snap_id);
1069 if (which == BAD_SNAP_INDEX)
Josh Durginda6a6b62013-09-04 17:57:31 -07001070 return ERR_PTR(-ENOENT);
Alex Elder54cac612013-04-30 00:44:33 -05001071
Josh Durginda6a6b62013-09-04 17:57:31 -07001072 snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1073 return snap_name ? snap_name : ERR_PTR(-ENOMEM);
Alex Elder54cac612013-04-30 00:44:33 -05001074}
1075
Alex Elder9e15b772012-10-30 19:40:33 -05001076static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
1077{
Alex Elder9e15b772012-10-30 19:40:33 -05001078 if (snap_id == CEPH_NOSNAP)
1079 return RBD_SNAP_HEAD_NAME;
1080
Alex Elder54cac612013-04-30 00:44:33 -05001081 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1082 if (rbd_dev->image_format == 1)
1083 return rbd_dev_v1_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -05001084
Alex Elder54cac612013-04-30 00:44:33 -05001085 return rbd_dev_v2_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -05001086}
1087
/*
 * Look up the image size (bytes) as of the given snapshot;
 * CEPH_NOSNAP means the current (head) size.
 *
 * Returns 0 and stores the result in *snap_size, or a negative errno
 * (-ENOENT if the snapshot id is unknown, or an error from the osd
 * request for format 2 images).
 */
static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_size)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_size = rbd_dev->header.image_size;
	} else if (rbd_dev->image_format == 1) {
		u32 which;

		/* Format 1 keeps per-snapshot sizes in the header */
		which = rbd_dev_snap_index(rbd_dev, snap_id);
		if (which == BAD_SNAP_INDEX)
			return -ENOENT;

		*snap_size = rbd_dev->header.snap_sizes[which];
	} else {
		u64 size = 0;
		int ret;

		/* Format 2: query the osds for the snapshot's size */
		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
		if (ret)
			return ret;

		*snap_size = size;
	}
	return 0;
}
1114
/*
 * Look up the feature bits in effect for the given snapshot;
 * CEPH_NOSNAP means the current (head) features.
 *
 * Returns 0 and stores the result in *snap_features, or a negative
 * errno from the osd request for format 2 images.
 */
static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_features)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_features = rbd_dev->header.features;
	} else if (rbd_dev->image_format == 1) {
		*snap_features = 0;	/* No features for format 1 */
	} else {
		u64 features = 0;

		int ret;

		/* Format 2: query the osds for the snapshot's features */
		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
		if (ret)
			return ret;

		*snap_features = features;
	}
	return 0;
}
1135
/*
 * Populate rbd_dev->mapping (size and features) for the snapshot the
 * device is mapped to (rbd_dev->spec->snap_id).  Returns 0 on
 * success or a negative errno from the size/features lookups; on
 * error the mapping is left unmodified.
 */
static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
{
	u64 snap_id = rbd_dev->spec->snap_id;
	u64 size = 0;
	u64 features = 0;
	int ret;

	ret = rbd_snap_size(rbd_dev, snap_id, &size);
	if (ret)
		return ret;
	ret = rbd_snap_features(rbd_dev, snap_id, &features);
	if (ret)
		return ret;

	rbd_dev->mapping.size = size;
	rbd_dev->mapping.features = features;

	return 0;
}
1155
/* Reset the mapped size and features (inverse of rbd_dev_mapping_set()) */
static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
{
	rbd_dev->mapping.size = 0;
	rbd_dev->mapping.features = 0;
}
1161
/* Release a name obtained from rbd_segment_name() */
static void rbd_segment_name_free(const char *name)
{
	/* The explicit cast here is needed to drop the const qualifier */

	kmem_cache_free(rbd_segment_name_cache, (void *)name);
}
1168
/*
 * Build the object name for the segment of the image containing the
 * given byte offset.  The name is "<prefix>.<segment>", with the
 * segment number formatted as 12 hex digits for format 1 images and
 * 16 for format 2.
 *
 * Returns a name allocated from rbd_segment_name_cache (free with
 * rbd_segment_name_free()), or NULL on allocation or formatting
 * failure.
 */
static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
{
	char *name;
	u64 segment;
	int ret;
	char *name_format;

	name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
	if (!name)
		return NULL;
	segment = offset >> rbd_dev->header.obj_order;
	name_format = "%s.%012llx";
	if (rbd_dev->image_format == 2)
		name_format = "%s.%016llx";
	ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format,
			rbd_dev->header.object_prefix, segment);
	if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
		/* Truncated or formatting error; don't use the name */
		pr_err("error formatting segment name for #%llu (%d)\n",
			segment, ret);
		rbd_segment_name_free(name);
		name = NULL;
	}

	return name;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001194
Alex Elder65ccfe22012-08-09 10:33:26 -07001195static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
1196{
1197 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001198
Alex Elder65ccfe22012-08-09 10:33:26 -07001199 return offset & (segment_size - 1);
1200}
1201
1202static u64 rbd_segment_length(struct rbd_device *rbd_dev,
1203 u64 offset, u64 length)
1204{
1205 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
1206
1207 offset &= segment_size - 1;
1208
Alex Elderaafb2302012-09-06 16:00:54 -05001209 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -07001210 if (offset + length > segment_size)
1211 length = segment_size - offset;
1212
1213 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001214}
1215
1216/*
Josh Durgin029bcbd2011-07-22 11:35:23 -07001217 * returns the size of an object in the image
1218 */
1219static u64 rbd_obj_bytes(struct rbd_image_header *header)
1220{
1221 return 1 << header->obj_order;
1222}
1223
1224/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001225 * bio helpers
1226 */
1227
1228static void bio_chain_put(struct bio *chain)
1229{
1230 struct bio *tmp;
1231
1232 while (chain) {
1233 tmp = chain;
1234 chain = chain->bi_next;
1235 bio_put(tmp);
1236 }
1237}
1238
/*
 * zeros a bio chain, starting at specific offset
 *
 * Bytes before start_ofs (counted from the start of the chain's
 * data) are left untouched; everything from there to the end of the
 * chain is zeroed.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec bv;
	struct bvec_iter iter;
	unsigned long flags;
	void *buf;
	int pos = 0;

	while (chain) {
		bio_for_each_segment(bv, chain, iter) {
			if (pos + bv.bv_len > start_ofs) {
				/* Zero only the tail of a segment straddling start_ofs */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(&bv, &flags);
				memset(buf + remainder, 0,
				       bv.bv_len - remainder);
				flush_dcache_page(bv.bv_page);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv.bv_len;
		}

		chain = chain->bi_next;
	}
}
1266
/*
 * similar to zero_bio_chain(), zeros data defined by a page array,
 * starting at the given byte offset from the start of the array and
 * continuing up to the given end offset.  The pages array is
 * assumed to be big enough to hold all bytes up to the end.
 */
static void zero_pages(struct page **pages, u64 offset, u64 end)
{
	struct page **page = &pages[offset >> PAGE_SHIFT];

	rbd_assert(end > offset);
	rbd_assert(end - offset <= (u64)SIZE_MAX);
	while (offset < end) {
		size_t page_offset;
		size_t length;
		unsigned long flags;
		void *kaddr;

		/* Zero at most the remainder of the current page */
		page_offset = offset & ~PAGE_MASK;
		length = min_t(size_t, PAGE_SIZE - page_offset, end - offset);
		local_irq_save(flags);
		kaddr = kmap_atomic(*page);
		memset(kaddr + page_offset, 0, length);
		flush_dcache_page(*page);
		kunmap_atomic(kaddr);
		local_irq_restore(flags);

		offset += length;
		page++;
	}
}
1298
1299/*
Alex Elderf7760da2012-10-20 22:17:27 -05001300 * Clone a portion of a bio, starting at the given byte offset
1301 * and continuing for the number of bytes indicated.
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001302 */
Alex Elderf7760da2012-10-20 22:17:27 -05001303static struct bio *bio_clone_range(struct bio *bio_src,
1304 unsigned int offset,
1305 unsigned int len,
1306 gfp_t gfpmask)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001307{
Alex Elderf7760da2012-10-20 22:17:27 -05001308 struct bio *bio;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001309
Kent Overstreet5341a6272013-08-07 14:31:11 -07001310 bio = bio_clone(bio_src, gfpmask);
Alex Elderf7760da2012-10-20 22:17:27 -05001311 if (!bio)
1312 return NULL; /* ENOMEM */
1313
Kent Overstreet5341a6272013-08-07 14:31:11 -07001314 bio_advance(bio, offset);
Kent Overstreet4f024f32013-10-11 15:44:27 -07001315 bio->bi_iter.bi_size = len;
Alex Elder542582f2012-08-09 10:33:25 -07001316
Alex Elderf7760da2012-10-20 22:17:27 -05001317 return bio;
1318}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001319
/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_iter.bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi) {
			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
			goto out_err;	/* EINVAL; ran out of bio's */
		}
		/* Clone at most the rest of this bio, or the rest of len */
		bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		/* Append the clone to the chain being built */
		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_iter.bi_size) {
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	bio_chain_put(chain);

	return NULL;
}
1382
/*
 * The default/initial value for all object request flags is 0.  For
 * each flag, once its value is set to 1 it is never reset to 0
 * again.
 */
static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
{
	/* Warn (once per request) if the flag was already set */
	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
		struct rbd_device *rbd_dev;

		rbd_dev = obj_request->img_request->rbd_dev;
		rbd_warn(rbd_dev, "obj_request %p already marked img_data",
			obj_request);
	}
}
1398
/* Test the IMG_DATA flag; the barrier orders the read after prior updates */
static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
}
1404
/* Mark an object request done, warning if it was already marked */
static void obj_request_done_set(struct rbd_obj_request *obj_request)
{
	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
		struct rbd_device *rbd_dev = NULL;

		/* img_request (and its rbd_dev) only exists for image requests */
		if (obj_request_img_data_test(obj_request))
			rbd_dev = obj_request->img_request->rbd_dev;
		rbd_warn(rbd_dev, "obj_request %p already marked done",
			obj_request);
	}
}
1416
/* Test the DONE flag; the barrier orders the read after prior updates */
static bool obj_request_done_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
}
1422
/*
 * This sets the KNOWN flag after (possibly) setting the EXISTS
 * flag.  The latter is set based on the "exists" value provided.
 *
 * Note that for our purposes once an object exists it never goes
 * away again.  It's possible that the response from two existence
 * checks are separated by the creation of the target object, and
 * the first ("doesn't exist") response arrives *after* the second
 * ("does exist").  In that case we ignore the second one.
 */
static void obj_request_existence_set(struct rbd_obj_request *obj_request,
				bool exists)
{
	if (exists)
		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
	/* Make both flag updates visible before any later KNOWN/EXISTS test */
	smp_mb();
}
1441
/* Has an existence check for this request's object completed? */
static bool obj_request_known_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
}
1447
/* Is this request's object known to exist?  (Only meaningful if KNOWN) */
static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
}
1453
Ilya Dryomov96385562014-06-10 13:53:29 +04001454static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
1455{
1456 struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
1457
1458 return obj_request->img_offset <
1459 round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header));
1460}
1461
/* Take a reference on an object request */
static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		atomic_read(&obj_request->kref.refcount));
	kref_get(&obj_request->kref);
}
1468
static void rbd_obj_request_destroy(struct kref *kref);
/*
 * Drop a reference on an object request; the request is destroyed
 * when the last reference is dropped.
 */
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		atomic_read(&obj_request->kref.refcount));
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}
1477
/* Take an additional reference on an image request. */
static void rbd_img_request_get(struct rbd_img_request *img_request)
{
	dout("%s: img %p (was %d)\n", __func__, img_request,
	     atomic_read(&img_request->kref.refcount));
	kref_get(&img_request->kref);
}
1484
static bool img_request_child_test(struct rbd_img_request *img_request);
static void rbd_parent_request_destroy(struct kref *kref);
static void rbd_img_request_destroy(struct kref *kref);
/*
 * Drop a reference on an image request.  Child (parent-read) image
 * requests use a different destructor than top-level ones.
 */
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	dout("%s: img %p (was %d)\n", __func__, img_request,
		atomic_read(&img_request->kref.refcount));
	if (img_request_child_test(img_request))
		kref_put(&img_request->kref, rbd_parent_request_destroy);
	else
		kref_put(&img_request->kref, rbd_img_request_destroy);
}
1498
/*
 * Add an object request to the tail of an image request's list.
 * The object request must not already belong to an image request.
 * The image request takes over the caller's reference on the object
 * request (dropped again in rbd_img_obj_request_del()).
 */
static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->img_request == NULL);

	/* Image request now owns object's original reference */
	obj_request->img_request = img_request;
	/* "which" records the object's position within the image request */
	obj_request->which = img_request->obj_request_count;
	rbd_assert(!obj_request_img_data_test(obj_request));
	obj_request_img_data_set(obj_request);
	rbd_assert(obj_request->which != BAD_WHICH);
	img_request->obj_request_count++;
	list_add_tail(&obj_request->links, &img_request->obj_requests);
	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
}
1515
/*
 * Remove an object request from its image request and drop the
 * image request's reference on it.  The assertion on "which" after
 * the decrement means only the *last* object request in the list
 * may be removed (callers tear down in reverse order).
 */
static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->which != BAD_WHICH);

	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
	list_del(&obj_request->links);
	rbd_assert(img_request->obj_request_count > 0);
	img_request->obj_request_count--;
	rbd_assert(obj_request->which == img_request->obj_request_count);
	obj_request->which = BAD_WHICH;
	rbd_assert(obj_request_img_data_test(obj_request));
	rbd_assert(obj_request->img_request == img_request);
	obj_request->img_request = NULL;
	obj_request->callback = NULL;
	rbd_obj_request_put(obj_request);
}
1534
1535static bool obj_request_type_valid(enum obj_request_type type)
1536{
1537 switch (type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001538 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001539 case OBJ_REQUEST_BIO:
Alex Elder788e2df2013-01-17 12:25:27 -06001540 case OBJ_REQUEST_PAGES:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001541 return true;
1542 default:
1543 return false;
1544 }
1545}
1546
/*
 * Hand the object request's osd request to the osd client for
 * submission.  Returns 0 or a negative error from the osd client.
 */
static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
				struct rbd_obj_request *obj_request)
{
	dout("%s %p\n", __func__, obj_request);
	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
}
1553
/* Cancel the underlying osd request for this object request. */
static void rbd_obj_request_end(struct rbd_obj_request *obj_request)
{
	dout("%s %p\n", __func__, obj_request);
	ceph_osdc_cancel_request(obj_request->osd_req);
}
1559
/*
 * Wait for an object request to complete.  If interrupted, cancel the
 * underlying osd request.
 *
 * Returns 0 on completion, or the negative error from the interrupted
 * wait (the osd request is canceled in that case so it cannot complete
 * against freed state).
 */
static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
	int ret;

	dout("%s %p\n", __func__, obj_request);

	ret = wait_for_completion_interruptible(&obj_request->completion);
	if (ret < 0) {
		dout("%s %p interrupted\n", __func__, obj_request);
		rbd_obj_request_end(obj_request);
		return ret;
	}

	dout("%s %p done\n", __func__, obj_request);
	return 0;
}
1580
/*
 * Finish an image request: on success, total up the bytes transferred
 * by its object requests, then either invoke the caller's completion
 * callback or, if none was set, drop the image request reference.
 */
static void rbd_img_request_complete(struct rbd_img_request *img_request)
{

	dout("%s: img %p\n", __func__, img_request);

	/*
	 * If no error occurred, compute the aggregate transfer
	 * count for the image request.  We could instead use
	 * atomic64_cmpxchg() to update it as each object request
	 * completes; not clear which way is better off hand.
	 */
	if (!img_request->result) {
		struct rbd_obj_request *obj_request;
		u64 xferred = 0;

		for_each_obj_request(img_request, obj_request)
			xferred += obj_request->xferred;
		img_request->xferred = xferred;
	}

	if (img_request->callback)
		img_request->callback(img_request);
	else
		rbd_img_request_put(img_request);
}
1606
/*
 * The default/initial value for all image request flags is 0.  Each
 * is conditionally set to 1 at image request initialization time
 * and currently never change thereafter.
 */
/* Mark the image request as a write request. */
static void img_request_write_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_WRITE, &img_request->flags);
	smp_mb();
}

/* Return true if the image request is a write request. */
static bool img_request_write_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
}
1623
/*
 * Set the discard flag when the img_request is a discard request
 */
static void img_request_discard_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_DISCARD, &img_request->flags);
	smp_mb();
}

/* Return true if the image request is a discard request. */
static bool img_request_discard_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0;
}
1638
/* Mark the image request as a child (parent read) request. */
static void img_request_child_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_CHILD, &img_request->flags);
	smp_mb();
}

/* Clear the child flag (used when tearing down a parent request). */
static void img_request_child_clear(struct rbd_img_request *img_request)
{
	clear_bit(IMG_REQ_CHILD, &img_request->flags);
	smp_mb();
}

/* Return true if the image request is a child request. */
static bool img_request_child_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
}
1656
/* Mark the image request as targeting a layered (cloned) image. */
static void img_request_layered_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_LAYERED, &img_request->flags);
	smp_mb();
}

/* Clear the layered flag (done once the parent reference is dropped). */
static void img_request_layered_clear(struct rbd_img_request *img_request)
{
	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
	smp_mb();
}

/* Return true if the image request targets a layered image. */
static bool img_request_layered_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
}
1674
/*
 * Completion handling for a read that is part of an image request:
 * normalize the result so the block layer always sees the full
 * requested length (zero-filling holes and short reads).
 */
static void
rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
{
	u64 xferred = obj_request->xferred;
	u64 length = obj_request->length;

	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
		obj_request, obj_request->img_request, obj_request->result,
		xferred, length);
	/*
	 * ENOENT means a hole in the image.  We zero-fill the entire
	 * length of the request.  A short read also implies zero-fill
	 * to the end of the request.  An error requires the whole
	 * length of the request to be reported finished with an error
	 * to the block layer.  In each case we update the xferred
	 * count to indicate the whole request was satisfied.
	 */
	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
	if (obj_request->result == -ENOENT) {
		if (obj_request->type == OBJ_REQUEST_BIO)
			zero_bio_chain(obj_request->bio_list, 0);
		else
			zero_pages(obj_request->pages, 0, length);
		obj_request->result = 0;
	} else if (xferred < length && !obj_request->result) {
		if (obj_request->type == OBJ_REQUEST_BIO)
			zero_bio_chain(obj_request->bio_list, xferred);
		else
			zero_pages(obj_request->pages, xferred, length);
	}
	obj_request->xferred = length;
	obj_request_done_set(obj_request);
}
1708
/*
 * Deliver completion of an object request: invoke the registered
 * callback if there is one, otherwise wake any synchronous waiter.
 */
static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p cb %p\n", __func__, obj_request,
		obj_request->callback);
	if (obj_request->callback)
		obj_request->callback(obj_request);
	else
		complete_all(&obj_request->completion);
}
1718
/* For ops with no result processing, just mark the request done. */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);
	obj_request_done_set(obj_request);
}
1724
/*
 * Completion handling for a read op.  For a layered image, -ENOENT
 * within the parent overlap means the data must be read from the
 * parent instead; otherwise hand off to the image-read callback
 * (zero-fill handling) or mark a standalone request done.
 */
static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request = NULL;
	struct rbd_device *rbd_dev = NULL;
	bool layered = false;

	if (obj_request_img_data_test(obj_request)) {
		img_request = obj_request->img_request;
		layered = img_request && img_request_layered_test(img_request);
		rbd_dev = img_request->rbd_dev;
	}

	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
		obj_request, img_request, obj_request->result,
		obj_request->xferred, obj_request->length);
	if (layered && obj_request->result == -ENOENT &&
			obj_request->img_offset < rbd_dev->parent_overlap)
		rbd_img_parent_read(obj_request);
	else if (img_request)
		rbd_img_obj_request_read_callback(obj_request);
	else
		obj_request_done_set(obj_request);
}
1748
/* Completion handling for a write op. */
static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
		obj_request->result, obj_request->length);
	/*
	 * There is no such thing as a successful short write.  Set
	 * it to our originally-requested length.
	 */
	obj_request->xferred = obj_request->length;
	obj_request_done_set(obj_request);
}
1760
/* Completion handling for a discard (delete/truncate/zero) op. */
static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
		obj_request->result, obj_request->length);
	/*
	 * There is no such thing as a successful short discard.  Set
	 * it to our originally-requested length.
	 */
	obj_request->xferred = obj_request->length;
	obj_request_done_set(obj_request);
}
1772
/*
 * For a simple stat call there's nothing to do.  We'll do more if
 * this is part of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);
	obj_request_done_set(obj_request);
}
1782
/*
 * Main osd request completion callback.  Records the result and
 * transfer count from the osd reply, dispatches to the per-opcode
 * handler based on the request's *first* op, and completes the
 * object request once it is marked done.
 */
static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
				struct ceph_msg *msg)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;
	u16 opcode;

	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
	rbd_assert(osd_req == obj_request->osd_req);
	if (obj_request_img_data_test(obj_request)) {
		rbd_assert(obj_request->img_request);
		rbd_assert(obj_request->which != BAD_WHICH);
	} else {
		rbd_assert(obj_request->which == BAD_WHICH);
	}

	if (osd_req->r_result < 0)
		obj_request->result = osd_req->r_result;

	rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP);

	/*
	 * We support a 64-bit length, but ultimately it has to be
	 * passed to blk_end_request(), which takes an unsigned int.
	 */
	obj_request->xferred = osd_req->r_reply_op_len[0];
	rbd_assert(obj_request->xferred < (u64)UINT_MAX);

	opcode = osd_req->r_ops[0].op;
	switch (opcode) {
	case CEPH_OSD_OP_READ:
		rbd_osd_read_callback(obj_request);
		break;
	case CEPH_OSD_OP_SETALLOCHINT:
		/* a hint op is always followed by the actual write */
		rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE);
		/* fall through */
	case CEPH_OSD_OP_WRITE:
		rbd_osd_write_callback(obj_request);
		break;
	case CEPH_OSD_OP_STAT:
		rbd_osd_stat_callback(obj_request);
		break;
	case CEPH_OSD_OP_DELETE:
	case CEPH_OSD_OP_TRUNCATE:
	case CEPH_OSD_OP_ZERO:
		rbd_osd_discard_callback(obj_request);
		break;
	case CEPH_OSD_OP_CALL:
	case CEPH_OSD_OP_NOTIFY_ACK:
	case CEPH_OSD_OP_WATCH:
		rbd_osd_trivial_callback(obj_request);
		break;
	default:
		rbd_warn(NULL, "%s: unsupported op %hu",
			obj_request->object_name, (unsigned short) opcode);
		break;
	}

	if (obj_request_done_test(obj_request))
		rbd_obj_request_complete(obj_request);
}
1843
/*
 * Finalize a read osd request: build it against the image's snapshot
 * id (CEPH_NOSNAP for a standalone object request).
 */
static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request = obj_request->img_request;
	struct ceph_osd_request *osd_req = obj_request->osd_req;
	u64 snap_id;

	rbd_assert(osd_req != NULL);

	snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
	ceph_osdc_build_request(osd_req, obj_request->offset,
			NULL, snap_id, NULL);
}
1856
/*
 * Finalize a write osd request: build it with the image's snapshot
 * context (NULL for a standalone object request) and a current mtime.
 */
static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request = obj_request->img_request;
	struct ceph_osd_request *osd_req = obj_request->osd_req;
	struct ceph_snap_context *snapc;
	struct timespec mtime = CURRENT_TIME;

	rbd_assert(osd_req != NULL);

	snapc = img_request ? img_request->snapc : NULL;
	ceph_osdc_build_request(osd_req, obj_request->offset,
			snapc, CEPH_NOSNAP, &mtime);
}
1870
Ilya Dryomov0ccd5922014-02-25 16:22:28 +02001871/*
1872 * Create an osd request. A read request has one osd op (read).
1873 * A write request has either one (watch) or two (hint+write) osd ops.
1874 * (All rbd data writes are prefixed with an allocation hint op, but
1875 * technically osd watch is a write request, hence this distinction.)
1876 */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001877static struct ceph_osd_request *rbd_osd_req_create(
1878 struct rbd_device *rbd_dev,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08001879 enum obj_operation_type op_type,
Ilya Dryomovdeb236b2014-02-25 16:22:27 +02001880 unsigned int num_ops,
Alex Elder430c28c2013-04-03 21:32:51 -05001881 struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001882{
Alex Elderbf0d5f502012-11-22 00:00:08 -06001883 struct ceph_snap_context *snapc = NULL;
1884 struct ceph_osd_client *osdc;
1885 struct ceph_osd_request *osd_req;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001886
Guangliang Zhao90e98c52014-04-01 22:22:16 +08001887 if (obj_request_img_data_test(obj_request) &&
1888 (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) {
Alex Elder6365d332013-02-11 12:33:24 -06001889 struct rbd_img_request *img_request = obj_request->img_request;
Guangliang Zhao90e98c52014-04-01 22:22:16 +08001890 if (op_type == OBJ_OP_WRITE) {
1891 rbd_assert(img_request_write_test(img_request));
1892 } else {
1893 rbd_assert(img_request_discard_test(img_request));
1894 }
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08001895 snapc = img_request->snapc;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001896 }
1897
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08001898 rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2));
Ilya Dryomovdeb236b2014-02-25 16:22:27 +02001899
1900 /* Allocate and initialize the request, for the num_ops ops */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001901
1902 osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomovdeb236b2014-02-25 16:22:27 +02001903 osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
1904 GFP_ATOMIC);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001905 if (!osd_req)
1906 return NULL; /* ENOMEM */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001907
Guangliang Zhao90e98c52014-04-01 22:22:16 +08001908 if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001909 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
Alex Elder430c28c2013-04-03 21:32:51 -05001910 else
Alex Elderbf0d5f502012-11-22 00:00:08 -06001911 osd_req->r_flags = CEPH_OSD_FLAG_READ;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001912
1913 osd_req->r_callback = rbd_osd_req_callback;
1914 osd_req->r_priv = obj_request;
1915
Ilya Dryomov3c972c92014-01-27 17:40:20 +02001916 osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
1917 ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001918
Alex Elderbf0d5f502012-11-22 00:00:08 -06001919 return osd_req;
1920}
1921
Alex Elder0eefd472013-04-19 15:34:50 -05001922/*
1923 * Create a copyup osd request based on the information in the
Ilya Dryomov0ccd5922014-02-25 16:22:28 +02001924 * object request supplied. A copyup request has three osd ops,
1925 * a copyup method call, a hint op, and a write op.
Alex Elder0eefd472013-04-19 15:34:50 -05001926 */
1927static struct ceph_osd_request *
1928rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
1929{
1930 struct rbd_img_request *img_request;
1931 struct ceph_snap_context *snapc;
1932 struct rbd_device *rbd_dev;
1933 struct ceph_osd_client *osdc;
1934 struct ceph_osd_request *osd_req;
1935
1936 rbd_assert(obj_request_img_data_test(obj_request));
1937 img_request = obj_request->img_request;
1938 rbd_assert(img_request);
1939 rbd_assert(img_request_write_test(img_request));
1940
Ilya Dryomov0ccd5922014-02-25 16:22:28 +02001941 /* Allocate and initialize the request, for the three ops */
Alex Elder0eefd472013-04-19 15:34:50 -05001942
1943 snapc = img_request->snapc;
1944 rbd_dev = img_request->rbd_dev;
1945 osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomov0ccd5922014-02-25 16:22:28 +02001946 osd_req = ceph_osdc_alloc_request(osdc, snapc, 3, false, GFP_ATOMIC);
Alex Elder0eefd472013-04-19 15:34:50 -05001947 if (!osd_req)
1948 return NULL; /* ENOMEM */
1949
1950 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1951 osd_req->r_callback = rbd_osd_req_callback;
1952 osd_req->r_priv = obj_request;
1953
Ilya Dryomov3c972c92014-01-27 17:40:20 +02001954 osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
1955 ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
Alex Elder0eefd472013-04-19 15:34:50 -05001956
Alex Elder0eefd472013-04-19 15:34:50 -05001957 return osd_req;
1958}
1959
1960
/* Release a reference on an osd request obtained from rbd_osd_req_create*(). */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1965
1966/* object_name is assumed to be a non-null pointer and NUL-terminated */
1967
1968static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1969 u64 offset, u64 length,
1970 enum obj_request_type type)
1971{
1972 struct rbd_obj_request *obj_request;
1973 size_t size;
1974 char *name;
1975
1976 rbd_assert(obj_request_type_valid(type));
1977
1978 size = strlen(object_name) + 1;
Alex Elderf907ad52013-05-01 12:43:03 -05001979 name = kmalloc(size, GFP_KERNEL);
1980 if (!name)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001981 return NULL;
1982
Alex Elder868311b2013-05-01 12:43:03 -05001983 obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL);
Alex Elderf907ad52013-05-01 12:43:03 -05001984 if (!obj_request) {
1985 kfree(name);
1986 return NULL;
1987 }
1988
Alex Elderbf0d5f502012-11-22 00:00:08 -06001989 obj_request->object_name = memcpy(name, object_name, size);
1990 obj_request->offset = offset;
1991 obj_request->length = length;
Alex Elder926f9b32013-02-11 12:33:24 -06001992 obj_request->flags = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001993 obj_request->which = BAD_WHICH;
1994 obj_request->type = type;
1995 INIT_LIST_HEAD(&obj_request->links);
Alex Elder788e2df2013-01-17 12:25:27 -06001996 init_completion(&obj_request->completion);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001997 kref_init(&obj_request->kref);
1998
Alex Elder37206ee2013-02-20 17:32:08 -06001999 dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
2000 offset, length, (int)type, obj_request);
2001
Alex Elderbf0d5f502012-11-22 00:00:08 -06002002 return obj_request;
2003}
2004
/*
 * Free an object request once its last reference is dropped (kref
 * release callback).  Releases the osd request and any attached data
 * (bio chain or page vector) along with the object name.
 */
static void rbd_obj_request_destroy(struct kref *kref)
{
	struct rbd_obj_request *obj_request;

	obj_request = container_of(kref, struct rbd_obj_request, kref);

	dout("%s: obj %p\n", __func__, obj_request);

	/* must already be detached from any image request */
	rbd_assert(obj_request->img_request == NULL);
	rbd_assert(obj_request->which == BAD_WHICH);

	if (obj_request->osd_req)
		rbd_osd_req_destroy(obj_request->osd_req);

	rbd_assert(obj_request_type_valid(obj_request->type));
	switch (obj_request->type) {
	case OBJ_REQUEST_NODATA:
		break;		/* Nothing to do */
	case OBJ_REQUEST_BIO:
		if (obj_request->bio_list)
			bio_chain_put(obj_request->bio_list);
		break;
	case OBJ_REQUEST_PAGES:
		if (obj_request->pages)
			ceph_release_page_vector(obj_request->pages,
						obj_request->page_count);
		break;
	}

	kfree(obj_request->object_name);
	obj_request->object_name = NULL;
	kmem_cache_free(rbd_obj_request_cache, obj_request);
}
2038
/* It's OK to call this for a device with no parent */

static void rbd_spec_put(struct rbd_spec *spec);
/*
 * Tear down a device's parent linkage: remove the parent device,
 * drop the parent spec reference and clear the overlap.
 */
static void rbd_dev_unparent(struct rbd_device *rbd_dev)
{
	rbd_dev_remove_parent(rbd_dev);
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;
}
2049
/*
 * Parent image reference counting is used to determine when an
 * image's parent fields can be safely torn down--after there are no
 * more in-flight requests to the parent image.  When the last
 * reference is dropped, cleaning them up is safe.
 */
static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
{
	int counter;

	if (!rbd_dev->parent_spec)
		return;

	/* "safe" decrement: a negative result indicates underflow */
	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
	if (counter > 0)
		return;

	/* Last reference; clean up parent data structures */

	if (!counter)
		rbd_dev_unparent(rbd_dev);
	else
		rbd_warn(rbd_dev, "parent reference underflow");
}
2074
/*
 * If an image has a non-zero parent overlap, get a reference to its
 * parent.
 *
 * We must get the reference before checking for the overlap to
 * coordinate properly with zeroing the parent overlap in
 * rbd_dev_v2_parent_info() when an image gets flattened.  We
 * drop it again if there is no overlap.
 *
 * Returns true if the rbd device has a parent with a non-zero
 * overlap and a reference for it was successfully taken, or
 * false otherwise.
 */
static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
{
	int counter;

	if (!rbd_dev->parent_spec)
		return false;

	/* "safe" increment: a negative result indicates overflow */
	counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
	if (counter > 0 && rbd_dev->parent_overlap)
		return true;

	/* Image was flattened, but parent is not yet torn down */

	if (counter < 0)
		rbd_warn(rbd_dev, "parent reference overflow");

	return false;
}
2106
/*
 * Caller is responsible for filling in the list of object requests
 * that comprises the image request, and the Linux request pointer
 * (if there is one).
 *
 * For write/discard requests the image request stores the supplied
 * snapshot context (the reference is dropped again in
 * rbd_img_request_destroy()); for reads it records the device's
 * mapped snapshot id instead.  Returns NULL on allocation failure.
 */
static struct rbd_img_request *rbd_img_request_create(
					struct rbd_device *rbd_dev,
					u64 offset, u64 length,
					enum obj_operation_type op_type,
					struct ceph_snap_context *snapc)
{
	struct rbd_img_request *img_request;

	/* GFP_NOIO: allocated on the I/O path */
	img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
	if (!img_request)
		return NULL;

	img_request->rq = NULL;
	img_request->rbd_dev = rbd_dev;
	img_request->offset = offset;
	img_request->length = length;
	img_request->flags = 0;
	if (op_type == OBJ_OP_DISCARD) {
		img_request_discard_set(img_request);
		img_request->snapc = snapc;
	} else if (op_type == OBJ_OP_WRITE) {
		img_request_write_set(img_request);
		img_request->snapc = snapc;
	} else {
		img_request->snap_id = rbd_dev->spec->snap_id;
	}
	if (rbd_dev_parent_get(rbd_dev))
		img_request_layered_set(img_request);
	spin_lock_init(&img_request->completion_lock);
	img_request->next_completion = 0;
	img_request->callback = NULL;
	img_request->result = 0;
	img_request->obj_request_count = 0;
	INIT_LIST_HEAD(&img_request->obj_requests);
	kref_init(&img_request->kref);

	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
		obj_op_name(op_type), offset, length, img_request);

	return img_request;
}
2153
2154static void rbd_img_request_destroy(struct kref *kref)
2155{
2156 struct rbd_img_request *img_request;
2157 struct rbd_obj_request *obj_request;
2158 struct rbd_obj_request *next_obj_request;
2159
2160 img_request = container_of(kref, struct rbd_img_request, kref);
2161
Alex Elder37206ee2013-02-20 17:32:08 -06002162 dout("%s: img %p\n", __func__, img_request);
2163
Alex Elderbf0d5f502012-11-22 00:00:08 -06002164 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2165 rbd_img_obj_request_del(img_request, obj_request);
Alex Elder25dcf952013-01-25 17:08:55 -06002166 rbd_assert(img_request->obj_request_count == 0);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002167
Alex Eldera2acd002013-05-08 22:50:04 -05002168 if (img_request_layered_test(img_request)) {
2169 img_request_layered_clear(img_request);
2170 rbd_dev_parent_put(img_request->rbd_dev);
2171 }
2172
Josh Durginbef95452014-04-04 17:47:52 -07002173 if (img_request_write_test(img_request) ||
2174 img_request_discard_test(img_request))
Alex Elder812164f82013-04-30 00:44:32 -05002175 ceph_put_snap_context(img_request->snapc);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002176
Alex Elder1c2a9df2013-05-01 12:43:03 -05002177 kmem_cache_free(rbd_img_request_cache, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002178}
2179
Alex Eldere93f3152013-05-08 22:50:04 -05002180static struct rbd_img_request *rbd_parent_request_create(
2181 struct rbd_obj_request *obj_request,
2182 u64 img_offset, u64 length)
2183{
2184 struct rbd_img_request *parent_request;
2185 struct rbd_device *rbd_dev;
2186
2187 rbd_assert(obj_request->img_request);
2188 rbd_dev = obj_request->img_request->rbd_dev;
2189
Josh Durgin4e752f02014-04-08 11:12:11 -07002190 parent_request = rbd_img_request_create(rbd_dev->parent, img_offset,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08002191 length, OBJ_OP_READ, NULL);
Alex Eldere93f3152013-05-08 22:50:04 -05002192 if (!parent_request)
2193 return NULL;
2194
2195 img_request_child_set(parent_request);
2196 rbd_obj_request_get(obj_request);
2197 parent_request->obj_request = obj_request;
2198
2199 return parent_request;
2200}
2201
2202static void rbd_parent_request_destroy(struct kref *kref)
2203{
2204 struct rbd_img_request *parent_request;
2205 struct rbd_obj_request *orig_request;
2206
2207 parent_request = container_of(kref, struct rbd_img_request, kref);
2208 orig_request = parent_request->obj_request;
2209
2210 parent_request->obj_request = NULL;
2211 rbd_obj_request_put(orig_request);
2212 img_request_child_clear(parent_request);
2213
2214 rbd_img_request_destroy(kref);
2215}
2216
/*
 * Finish one object request belonging to an image request: record its
 * result in the image request, and (for block-layer requests) report
 * the transferred byte count to the block layer.
 *
 * Returns true if more of the image request remains to be completed
 * (either further object requests, for a child request, or more of the
 * block request, per blk_end_request()), false otherwise.
 */
static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	unsigned int xferred;
	int result;
	bool more;

	rbd_assert(obj_request_img_data_test(obj_request));
	img_request = obj_request->img_request;

	/* xferred is reported to the block layer as an unsigned int. */
	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
	xferred = (unsigned int)obj_request->xferred;
	result = obj_request->result;
	if (result) {
		struct rbd_device *rbd_dev = img_request->rbd_dev;
		enum obj_operation_type op_type;

		/* Recover the operation type for the warning message. */
		if (img_request_discard_test(img_request))
			op_type = OBJ_OP_DISCARD;
		else if (img_request_write_test(img_request))
			op_type = OBJ_OP_WRITE;
		else
			op_type = OBJ_OP_READ;

		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)",
			obj_op_name(op_type), obj_request->length,
			obj_request->img_offset, obj_request->offset);
		rbd_warn(rbd_dev, " result %d xferred %x",
			result, xferred);
		/* First error encountered wins; don't overwrite it. */
		if (!img_request->result)
			img_request->result = result;
	}

	/* Image object requests don't own their page array */

	if (obj_request->type == OBJ_REQUEST_PAGES) {
		obj_request->pages = NULL;
		obj_request->page_count = 0;
	}

	if (img_request_child_test(img_request)) {
		/* Parent-read child: "more" means further object requests. */
		rbd_assert(img_request->obj_request != NULL);
		more = obj_request->which < img_request->obj_request_count - 1;
	} else {
		/* Block-layer request: let the block layer decide. */
		rbd_assert(img_request->rq != NULL);
		more = blk_end_request(img_request->rq, result, xferred);
	}

	return more;
}
2267
/*
 * Completion callback for an object request that is part of an image
 * request.  Object requests complete in any order, but the image
 * request is finished strictly in submission order: under
 * completion_lock, advance next_completion over every consecutive
 * already-done object request starting at the current position.  The
 * object request whose "which" is not next_completion simply records
 * itself as done and returns early.
 */
static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	u32 which = obj_request->which;
	bool more = true;

	rbd_assert(obj_request_img_data_test(obj_request));
	img_request = obj_request->img_request;

	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
	rbd_assert(img_request != NULL);
	rbd_assert(img_request->obj_request_count > 0);
	rbd_assert(which != BAD_WHICH);
	rbd_assert(which < img_request->obj_request_count);

	spin_lock_irq(&img_request->completion_lock);
	/* Not our turn yet; an earlier object request is still pending. */
	if (which != img_request->next_completion)
		goto out;

	/* Complete this and every consecutive finished request after it. */
	for_each_obj_request_from(img_request, obj_request) {
		rbd_assert(more);
		rbd_assert(which < img_request->obj_request_count);

		if (!obj_request_done_test(obj_request))
			break;
		more = rbd_img_obj_end_request(obj_request);
		which++;
	}

	/* "no more" exactly when every object request has been completed */
	rbd_assert(more ^ (which == img_request->obj_request_count));
	img_request->next_completion = which;
out:
	spin_unlock_irq(&img_request->completion_lock);
	/* Drop the reference taken per object request in rbd_img_request_fill(). */
	rbd_img_request_put(img_request);

	if (!more)
		rbd_img_request_complete(img_request);
}
2306
/*
 * Split up an image request into one or more object requests, each
 * to a different object.  The "type" parameter indicates whether
 * "data_desc" is the pointer to the head of a list of bio
 * structures, or the base of a page array.  In either case this
 * function assumes data_desc describes memory sufficient to hold
 * all data described by the image request.
 *
 * Each object request gets its own osd request (with an allocation
 * hint op prepended for plain writes) and rbd_img_obj_callback() as
 * its completion callback.  Returns 0 on success or -ENOMEM, after
 * unwinding any object requests already created.
 */
static int rbd_img_request_fill(struct rbd_img_request *img_request,
					enum obj_request_type type,
					void *data_desc)
{
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	struct rbd_obj_request *obj_request = NULL;
	struct rbd_obj_request *next_obj_request;
	struct bio *bio_list = NULL;
	unsigned int bio_offset = 0;
	struct page **pages = NULL;
	enum obj_operation_type op_type;
	u64 object_size = rbd_obj_bytes(&rbd_dev->header);
	u64 img_offset;
	u64 img_end;
	u64 resid;
	u16 opcode;

	dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
		(int)type, data_desc);

	img_offset = img_request->offset;
	resid = img_request->length;
	rbd_assert(resid > 0);

	if (type == OBJ_REQUEST_BIO) {
		bio_list = data_desc;
		/* The bio chain must start exactly at the image offset. */
		rbd_assert(img_offset ==
			   bio_list->bi_iter.bi_sector << SECTOR_SHIFT);
	} else if (type == OBJ_REQUEST_PAGES) {
		pages = data_desc;
	}

	/* One loop iteration per backing object touched by the request. */
	while (resid) {
		struct ceph_osd_request *osd_req;
		const char *object_name;
		u64 offset;
		u64 length;
		unsigned int which = 0;

		object_name = rbd_segment_name(rbd_dev, img_offset);
		if (!object_name)
			goto out_unwind;
		offset = rbd_segment_offset(rbd_dev, img_offset);
		length = rbd_segment_length(rbd_dev, img_offset, resid);
		obj_request = rbd_obj_request_create(object_name,
						offset, length, type);
		/* object request has its own copy of the object name */
		rbd_segment_name_free(object_name);
		if (!obj_request)
			goto out_unwind;

		/*
		 * set obj_request->img_request before creating the
		 * osd_request so that it gets the right snapc
		 */
		rbd_img_obj_request_add(img_request, obj_request);

		if (type == OBJ_REQUEST_BIO) {
			unsigned int clone_size;

			rbd_assert(length <= (u64)UINT_MAX);
			clone_size = (unsigned int)length;
			/* Clone just this object's slice of the bio chain. */
			obj_request->bio_list =
					bio_chain_clone_range(&bio_list,
								&bio_offset,
								clone_size,
								GFP_ATOMIC);
			if (!obj_request->bio_list)
				goto out_unwind;
		} else if (type == OBJ_REQUEST_PAGES) {
			unsigned int page_count;

			/* Point at this object's slice of the page array. */
			obj_request->pages = pages;
			page_count = (u32)calc_pages_for(offset, length);
			obj_request->page_count = page_count;
			if ((offset + length) & ~PAGE_MASK)
				page_count--;	/* more on last page */
			pages += page_count;
		}

		if (img_request_discard_test(img_request)) {
			op_type = OBJ_OP_DISCARD;
			/*
			 * Whole-object discard with no parent data under
			 * it: delete the object.  Discard reaching the end
			 * of the object (or of the image): truncate.
			 * Otherwise zero the byte range.
			 */
			if (!offset && (length == object_size)
				&& (!img_request_layered_test(img_request) ||
					(rbd_dev->parent_overlap <=
						obj_request->img_offset))) {
				opcode = CEPH_OSD_OP_DELETE;
			} else if ((offset + length == object_size)) {
				opcode = CEPH_OSD_OP_TRUNCATE;
			} else {
				down_read(&rbd_dev->header_rwsem);
				img_end = rbd_dev->header.image_size;
				up_read(&rbd_dev->header_rwsem);

				if (obj_request->img_offset + length == img_end)
					opcode = CEPH_OSD_OP_TRUNCATE;
				else
					opcode = CEPH_OSD_OP_ZERO;
			}
		} else if (img_request_write_test(img_request)) {
			op_type = OBJ_OP_WRITE;
			opcode = CEPH_OSD_OP_WRITE;
		} else {
			op_type = OBJ_OP_READ;
			opcode = CEPH_OSD_OP_READ;
		}

		/* Plain writes need a second op slot for the alloc hint. */
		osd_req = rbd_osd_req_create(rbd_dev, op_type,
					(op_type == OBJ_OP_WRITE) ? 2 : 1,
					obj_request);
		if (!osd_req)
			goto out_unwind;
		obj_request->osd_req = osd_req;
		obj_request->callback = rbd_img_obj_callback;
		/* Reference dropped by rbd_img_obj_callback() on completion. */
		rbd_img_request_get(img_request);

		if (op_type == OBJ_OP_WRITE) {
			osd_req_op_alloc_hint_init(osd_req, which,
					rbd_obj_bytes(&rbd_dev->header),
					rbd_obj_bytes(&rbd_dev->header));
			which++;
		}

		osd_req_op_extent_init(osd_req, which, opcode, offset, length,
				       0, 0);
		if (type == OBJ_REQUEST_BIO)
			osd_req_op_extent_osd_data_bio(osd_req, which,
					obj_request->bio_list, length);
		else if (type == OBJ_REQUEST_PAGES)
			osd_req_op_extent_osd_data_pages(osd_req, which,
					obj_request->pages, length,
					offset & ~PAGE_MASK, false, false);

		/* Discards are also writes */
		if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
			rbd_osd_req_format_write(obj_request);
		else
			rbd_osd_req_format_read(obj_request);

		obj_request->img_offset = img_offset;

		img_offset += length;
		resid -= length;
	}

	return 0;

out_unwind:
	/* Tear down every object request created so far. */
	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_img_obj_request_del(img_request, obj_request);

	return -ENOMEM;
}
2468
Alex Elder3d7efd12013-04-19 15:34:50 -05002469static void
Alex Elder0eefd472013-04-19 15:34:50 -05002470rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
2471{
2472 struct rbd_img_request *img_request;
2473 struct rbd_device *rbd_dev;
Alex Elderebda6402013-05-10 16:29:22 -05002474 struct page **pages;
Alex Elder0eefd472013-04-19 15:34:50 -05002475 u32 page_count;
2476
2477 rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2478 rbd_assert(obj_request_img_data_test(obj_request));
2479 img_request = obj_request->img_request;
2480 rbd_assert(img_request);
2481
2482 rbd_dev = img_request->rbd_dev;
2483 rbd_assert(rbd_dev);
Alex Elder0eefd472013-04-19 15:34:50 -05002484
Alex Elderebda6402013-05-10 16:29:22 -05002485 pages = obj_request->copyup_pages;
2486 rbd_assert(pages != NULL);
Alex Elder0eefd472013-04-19 15:34:50 -05002487 obj_request->copyup_pages = NULL;
Alex Elderebda6402013-05-10 16:29:22 -05002488 page_count = obj_request->copyup_page_count;
2489 rbd_assert(page_count);
2490 obj_request->copyup_page_count = 0;
2491 ceph_release_page_vector(pages, page_count);
Alex Elder0eefd472013-04-19 15:34:50 -05002492
2493 /*
2494 * We want the transfer count to reflect the size of the
2495 * original write request. There is no such thing as a
2496 * successful short write, so if the request was successful
2497 * we can just set it to the originally-requested length.
2498 */
2499 if (!obj_request->result)
2500 obj_request->xferred = obj_request->length;
2501
2502 /* Finish up with the normal image object callback */
2503
2504 rbd_img_obj_callback(obj_request);
2505}
2506
/*
 * Completion callback for the parent-image read issued by
 * rbd_img_obj_parent_read_full().  Takes the page vector filled with
 * parent data and turns the original write request into a three-op
 * copyup osd request (copyup call + alloc hint + original write),
 * then submits it.  On any failure the error is recorded in the
 * original object request and that request is completed.
 */
static void
rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
{
	struct rbd_obj_request *orig_request;
	struct ceph_osd_request *osd_req;
	struct ceph_osd_client *osdc;
	struct rbd_device *rbd_dev;
	struct page **pages;
	u32 page_count;
	int img_result;
	u64 parent_length;
	u64 offset;
	u64 length;

	rbd_assert(img_request_child_test(img_request));

	/* First get what we need from the image request */

	pages = img_request->copyup_pages;
	rbd_assert(pages != NULL);
	img_request->copyup_pages = NULL;
	page_count = img_request->copyup_page_count;
	rbd_assert(page_count);
	img_request->copyup_page_count = 0;

	orig_request = img_request->obj_request;
	rbd_assert(orig_request != NULL);
	rbd_assert(obj_request_type_valid(orig_request->type));
	img_result = img_request->result;
	parent_length = img_request->length;
	/* The parent read is only considered done when fully transferred. */
	rbd_assert(parent_length == img_request->xferred);
	rbd_img_request_put(img_request);

	rbd_assert(orig_request->img_request);
	rbd_dev = orig_request->img_request->rbd_dev;
	rbd_assert(rbd_dev);

	/*
	 * If the overlap has become 0 (most likely because the
	 * image has been flattened) we need to free the pages
	 * and re-submit the original write request.
	 */
	if (!rbd_dev->parent_overlap) {
		struct ceph_osd_client *osdc;

		ceph_release_page_vector(pages, page_count);
		osdc = &rbd_dev->rbd_client->client->osdc;
		img_result = rbd_obj_request_submit(osdc, orig_request);
		if (!img_result)
			return;
	}

	if (img_result)
		goto out_err;

	/*
	 * The original osd request is of no use to us any more.
	 * We need a new one that can hold the three ops in a copyup
	 * request.  Allocate the new copyup osd request for the
	 * original request, and release the old one.
	 */
	img_result = -ENOMEM;
	osd_req = rbd_osd_req_create_copyup(orig_request);
	if (!osd_req)
		goto out_err;
	rbd_osd_req_destroy(orig_request->osd_req);
	orig_request->osd_req = osd_req;
	/* Pages are now owned by the original request (freed in its callback). */
	orig_request->copyup_pages = pages;
	orig_request->copyup_page_count = page_count;

	/* Initialize the copyup op */

	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
	osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
						false, false);

	/* Then the hint op */

	osd_req_op_alloc_hint_init(osd_req, 1, rbd_obj_bytes(&rbd_dev->header),
				   rbd_obj_bytes(&rbd_dev->header));

	/* And the original write request op */

	offset = orig_request->offset;
	length = orig_request->length;
	osd_req_op_extent_init(osd_req, 2, CEPH_OSD_OP_WRITE,
					offset, length, 0, 0);
	if (orig_request->type == OBJ_REQUEST_BIO)
		osd_req_op_extent_osd_data_bio(osd_req, 2,
					orig_request->bio_list, length);
	else
		osd_req_op_extent_osd_data_pages(osd_req, 2,
					orig_request->pages, length,
					offset & ~PAGE_MASK, false, false);

	rbd_osd_req_format_write(orig_request);

	/* All set, send it off. */

	orig_request->callback = rbd_img_obj_copyup_callback;
	osdc = &rbd_dev->rbd_client->client->osdc;
	img_result = rbd_obj_request_submit(osdc, orig_request);
	if (!img_result)
		return;
out_err:
	/* Record the error code and complete the request */

	orig_request->result = img_result;
	orig_request->xferred = 0;
	obj_request_done_set(orig_request);
	rbd_obj_request_complete(orig_request);
}
2619
/*
 * Read from the parent image the range of data that covers the
 * entire target of the given object request.  This is used for
 * satisfying a layered image write request when the target of an
 * object request from the image request does not exist.
 *
 * A page array big enough to hold the returned data is allocated
 * and supplied to rbd_img_request_fill() as the "data descriptor."
 * When the read completes, this page array will be transferred to
 * the original object request for the copyup operation.
 *
 * If an error occurs, record it as the result of the original
 * object request and mark it done so it gets completed.
 */
static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request = NULL;
	struct rbd_img_request *parent_request = NULL;
	struct rbd_device *rbd_dev;
	u64 img_offset;
	u64 length;
	struct page **pages = NULL;
	u32 page_count;
	int result;

	rbd_assert(obj_request_img_data_test(obj_request));
	rbd_assert(obj_request_type_valid(obj_request->type));

	img_request = obj_request->img_request;
	rbd_assert(img_request != NULL);
	rbd_dev = img_request->rbd_dev;
	rbd_assert(rbd_dev->parent != NULL);

	/*
	 * Determine the byte range covered by the object in the
	 * child image to which the original request was to be sent.
	 */
	img_offset = obj_request->img_offset - obj_request->offset;
	length = (u64)1 << rbd_dev->header.obj_order;

	/*
	 * There is no defined parent data beyond the parent
	 * overlap, so limit what we read at that boundary if
	 * necessary.
	 */
	if (img_offset + length > rbd_dev->parent_overlap) {
		rbd_assert(img_offset < rbd_dev->parent_overlap);
		length = rbd_dev->parent_overlap - img_offset;
	}

	/*
	 * Allocate a page array big enough to receive the data read
	 * from the parent.
	 */
	page_count = (u32)calc_pages_for(0, length);
	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
	if (IS_ERR(pages)) {
		result = PTR_ERR(pages);
		pages = NULL;
		goto out_err;
	}

	result = -ENOMEM;
	parent_request = rbd_parent_request_create(obj_request,
						img_offset, length);
	if (!parent_request)
		goto out_err;

	result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
	if (result)
		goto out_err;
	/* Pages now belong to the parent request until its callback runs. */
	parent_request->copyup_pages = pages;
	parent_request->copyup_page_count = page_count;

	parent_request->callback = rbd_img_obj_parent_read_full_callback;
	result = rbd_img_request_submit(parent_request);
	if (!result)
		return 0;

	/* Submit failed: take back ownership before the error teardown. */
	parent_request->copyup_pages = NULL;
	parent_request->copyup_page_count = 0;
	parent_request->obj_request = NULL;
	rbd_obj_request_put(obj_request);
out_err:
	if (pages)
		ceph_release_page_vector(pages, page_count);
	if (parent_request)
		rbd_img_request_put(parent_request);
	obj_request->result = result;
	obj_request->xferred = 0;
	obj_request_done_set(obj_request);

	return result;
}
2714
/*
 * Completion callback for the STAT request issued by
 * rbd_img_obj_exists_submit().  Records whether the target object
 * exists on the original object request and resubmits it; anything
 * other than success or -ENOENT is treated as a real error and
 * completes the original request immediately.
 */
static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_obj_request *orig_request;
	struct rbd_device *rbd_dev;
	int result;

	rbd_assert(!obj_request_img_data_test(obj_request));

	/*
	 * All we need from the object request is the original
	 * request and the result of the STAT op.  Grab those, then
	 * we're done with the request.
	 *
	 * NOTE(review): orig_request is put here but dereferenced
	 * below; presumably its image request still holds a
	 * reference keeping it alive -- confirm.
	 */
	orig_request = obj_request->obj_request;
	obj_request->obj_request = NULL;
	rbd_obj_request_put(orig_request);
	rbd_assert(orig_request);
	rbd_assert(orig_request->img_request);

	result = obj_request->result;
	obj_request->result = 0;

	dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
		obj_request, orig_request, result,
		obj_request->xferred, obj_request->length);
	rbd_obj_request_put(obj_request);

	/*
	 * If the overlap has become 0 (most likely because the
	 * image has been flattened) we need to free the pages
	 * and re-submit the original write request.
	 */
	rbd_dev = orig_request->img_request->rbd_dev;
	if (!rbd_dev->parent_overlap) {
		struct ceph_osd_client *osdc;

		osdc = &rbd_dev->rbd_client->client->osdc;
		result = rbd_obj_request_submit(osdc, orig_request);
		if (!result)
			return;
	}

	/*
	 * Our only purpose here is to determine whether the object
	 * exists, and we don't want to treat the non-existence as
	 * an error.  If something else comes back, transfer the
	 * error to the original request and complete it now.
	 */
	if (!result) {
		obj_request_existence_set(orig_request, true);
	} else if (result == -ENOENT) {
		obj_request_existence_set(orig_request, false);
	} else if (result) {
		orig_request->result = result;
		goto out;
	}

	/*
	 * Resubmit the original request now that we have recorded
	 * whether the target object exists.
	 */
	orig_request->result = rbd_img_obj_request_submit(orig_request);
out:
	if (orig_request->result)
		rbd_obj_request_complete(orig_request);
}
2781
2782static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2783{
2784 struct rbd_obj_request *stat_request;
2785 struct rbd_device *rbd_dev;
2786 struct ceph_osd_client *osdc;
2787 struct page **pages = NULL;
2788 u32 page_count;
2789 size_t size;
2790 int ret;
2791
2792 /*
2793 * The response data for a STAT call consists of:
2794 * le64 length;
2795 * struct {
2796 * le32 tv_sec;
2797 * le32 tv_nsec;
2798 * } mtime;
2799 */
2800 size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2801 page_count = (u32)calc_pages_for(0, size);
2802 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2803 if (IS_ERR(pages))
2804 return PTR_ERR(pages);
2805
2806 ret = -ENOMEM;
2807 stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2808 OBJ_REQUEST_PAGES);
2809 if (!stat_request)
2810 goto out;
2811
2812 rbd_obj_request_get(obj_request);
2813 stat_request->obj_request = obj_request;
2814 stat_request->pages = pages;
2815 stat_request->page_count = page_count;
2816
2817 rbd_assert(obj_request->img_request);
2818 rbd_dev = obj_request->img_request->rbd_dev;
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08002819 stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
Ilya Dryomovdeb236b2014-02-25 16:22:27 +02002820 stat_request);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002821 if (!stat_request->osd_req)
2822 goto out;
2823 stat_request->callback = rbd_img_obj_exists_callback;
2824
2825 osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2826 osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2827 false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05002828 rbd_osd_req_format_read(stat_request);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002829
2830 osdc = &rbd_dev->rbd_client->client->osdc;
2831 ret = rbd_obj_request_submit(osdc, stat_request);
2832out:
2833 if (ret)
2834 rbd_obj_request_put(obj_request);
2835
2836 return ret;
2837}
2838
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002839static bool img_obj_request_simple(struct rbd_obj_request *obj_request)
Alex Elderb454e362013-04-19 15:34:50 -05002840{
2841 struct rbd_img_request *img_request;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002842 struct rbd_device *rbd_dev;
Alex Elderb454e362013-04-19 15:34:50 -05002843
2844 rbd_assert(obj_request_img_data_test(obj_request));
2845
2846 img_request = obj_request->img_request;
2847 rbd_assert(img_request);
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002848 rbd_dev = img_request->rbd_dev;
Alex Elderb454e362013-04-19 15:34:50 -05002849
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002850 /* Reads */
2851 if (!img_request_write_test(img_request))
2852 return true;
Alex Elderb454e362013-04-19 15:34:50 -05002853
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002854 /* Non-layered writes */
2855 if (!img_request_layered_test(img_request))
2856 return true;
2857
2858 /*
2859 * Layered writes outside of the parent overlap range don't
2860 * share any data with the parent.
2861 */
2862 if (!obj_request_overlaps_parent(obj_request))
2863 return true;
2864
2865 /*
Guangliang Zhaoc622d222014-04-01 22:22:15 +08002866 * Entire-object layered writes - we will overwrite whatever
2867 * parent data there is anyway.
2868 */
2869 if (!obj_request->offset &&
2870 obj_request->length == rbd_obj_bytes(&rbd_dev->header))
2871 return true;
2872
2873 /*
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002874 * If the object is known to already exist, its parent data has
2875 * already been copied.
2876 */
2877 if (obj_request_known_test(obj_request) &&
2878 obj_request_exists_test(obj_request))
2879 return true;
2880
2881 return false;
2882}
2883
2884static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2885{
2886 if (img_obj_request_simple(obj_request)) {
Alex Elderb454e362013-04-19 15:34:50 -05002887 struct rbd_device *rbd_dev;
2888 struct ceph_osd_client *osdc;
2889
2890 rbd_dev = obj_request->img_request->rbd_dev;
2891 osdc = &rbd_dev->rbd_client->client->osdc;
2892
2893 return rbd_obj_request_submit(osdc, obj_request);
2894 }
2895
2896 /*
Alex Elder3d7efd12013-04-19 15:34:50 -05002897 * It's a layered write. The target object might exist but
2898 * we may not know that yet. If we know it doesn't exist,
2899 * start by reading the data for the full target object from
2900 * the parent so we can use it for a copyup to the target.
Alex Elderb454e362013-04-19 15:34:50 -05002901 */
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002902 if (obj_request_known_test(obj_request))
Alex Elder3d7efd12013-04-19 15:34:50 -05002903 return rbd_img_obj_parent_read_full(obj_request);
2904
2905 /* We don't know whether the target exists. Go find out. */
Alex Elderb454e362013-04-19 15:34:50 -05002906
2907 return rbd_img_obj_exists_submit(obj_request);
2908}
2909
Alex Elderbf0d5f502012-11-22 00:00:08 -06002910static int rbd_img_request_submit(struct rbd_img_request *img_request)
2911{
Alex Elderbf0d5f502012-11-22 00:00:08 -06002912 struct rbd_obj_request *obj_request;
Alex Elder46faeed2013-04-10 17:47:46 -05002913 struct rbd_obj_request *next_obj_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002914
Alex Elder37206ee2013-02-20 17:32:08 -06002915 dout("%s: img %p\n", __func__, img_request);
Alex Elder46faeed2013-04-10 17:47:46 -05002916 for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
Alex Elderbf0d5f502012-11-22 00:00:08 -06002917 int ret;
2918
Alex Elderb454e362013-04-19 15:34:50 -05002919 ret = rbd_img_obj_request_submit(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002920 if (ret)
2921 return ret;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002922 }
2923
2924 return 0;
2925}
2926
/*
 * Completion callback for the child (parent-image) read issued by
 * rbd_img_parent_read().  Copies the outcome back into the original
 * object request, clipping the transfer count at the parent overlap
 * boundary, then completes that object request.
 */
static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
{
	struct rbd_obj_request *obj_request;
	struct rbd_device *rbd_dev;
	u64 obj_end;
	u64 img_xferred;
	int img_result;

	rbd_assert(img_request_child_test(img_request));

	/* First get what we need from the image request and release it */

	obj_request = img_request->obj_request;
	img_xferred = img_request->xferred;
	img_result = img_request->result;
	rbd_img_request_put(img_request);

	/*
	 * If the overlap has become 0 (most likely because the
	 * image has been flattened) we need to re-submit the
	 * original request.
	 */
	rbd_assert(obj_request);
	rbd_assert(obj_request->img_request);
	rbd_dev = obj_request->img_request->rbd_dev;
	if (!rbd_dev->parent_overlap) {
		struct ceph_osd_client *osdc;

		osdc = &rbd_dev->rbd_client->client->osdc;
		img_result = rbd_obj_request_submit(osdc, obj_request);
		if (!img_result)
			return;
		/* Submit failed; fall through and complete with the error. */
	}

	obj_request->result = img_result;
	if (obj_request->result)
		goto out;

	/*
	 * We need to zero anything beyond the parent overlap
	 * boundary.  Since rbd_img_obj_request_read_callback()
	 * will zero anything beyond the end of a short read, an
	 * easy way to do this is to pretend the data from the
	 * parent came up short--ending at the overlap boundary.
	 */
	rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
	obj_end = obj_request->img_offset + obj_request->length;
	if (obj_end > rbd_dev->parent_overlap) {
		u64 xferred = 0;

		if (obj_request->img_offset < rbd_dev->parent_overlap)
			xferred = rbd_dev->parent_overlap -
					obj_request->img_offset;

		/* Pretend the read ended at the overlap boundary. */
		obj_request->xferred = min(img_xferred, xferred);
	} else {
		/* Entirely within the overlap; report the full transfer. */
		obj_request->xferred = img_xferred;
	}
out:
	rbd_img_obj_request_read_callback(obj_request);
	rbd_obj_request_complete(obj_request);
}
2989
/*
 * The object of a read didn't exist (result == -ENOENT), so satisfy
 * the read from the parent image instead:  build a child image request
 * covering the same range, fill it with the original request's
 * bio/page data descriptors, and submit it.  Completion is handled by
 * rbd_img_parent_read_callback(); on setup failure the original
 * object request is completed with the error.
 */
static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	int result;

	rbd_assert(obj_request_img_data_test(obj_request));
	rbd_assert(obj_request->img_request != NULL);
	rbd_assert(obj_request->result == (s32) -ENOENT);
	rbd_assert(obj_request_type_valid(obj_request->type));

	/* rbd_read_finish(obj_request, obj_request->length); */
	img_request = rbd_parent_request_create(obj_request,
						obj_request->img_offset,
						obj_request->length);
	result = -ENOMEM;
	if (!img_request)
		goto out_err;

	/* Reuse the original request's data buffers for the parent read. */
	if (obj_request->type == OBJ_REQUEST_BIO)
		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
						obj_request->bio_list);
	else
		result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES,
						obj_request->pages);
	if (result)
		goto out_err;

	img_request->callback = rbd_img_parent_read_callback;
	result = rbd_img_request_submit(img_request);
	if (result)
		goto out_err;

	return;
out_err:
	if (img_request)
		rbd_img_request_put(img_request);
	/* Complete the original request with the setup error. */
	obj_request->result = result;
	obj_request->xferred = 0;
	obj_request_done_set(obj_request);
}
3030
Josh Durgin20e0af62013-08-29 17:36:03 -07003031static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
Alex Elderb8d70032012-11-30 17:53:04 -06003032{
3033 struct rbd_obj_request *obj_request;
Alex Elder21692382013-04-05 01:27:12 -05003034 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elderb8d70032012-11-30 17:53:04 -06003035 int ret;
3036
3037 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
3038 OBJ_REQUEST_NODATA);
3039 if (!obj_request)
3040 return -ENOMEM;
3041
3042 ret = -ENOMEM;
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08003043 obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
Ilya Dryomovdeb236b2014-02-25 16:22:27 +02003044 obj_request);
Alex Elderb8d70032012-11-30 17:53:04 -06003045 if (!obj_request->osd_req)
3046 goto out;
3047
Alex Elderc99d2d42013-04-05 01:27:11 -05003048 osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003049 notify_id, 0, 0);
Alex Elder9d4df012013-04-19 15:34:50 -05003050 rbd_osd_req_format_read(obj_request);
Alex Elder430c28c2013-04-03 21:32:51 -05003051
Alex Elderb8d70032012-11-30 17:53:04 -06003052 ret = rbd_obj_request_submit(osdc, obj_request);
Alex Eldercf81b602013-01-17 12:18:46 -06003053 if (ret)
Josh Durgin20e0af62013-08-29 17:36:03 -07003054 goto out;
3055 ret = rbd_obj_request_wait(obj_request);
3056out:
3057 rbd_obj_request_put(obj_request);
Alex Elderb8d70032012-11-30 17:53:04 -06003058
3059 return ret;
3060}
3061
3062static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
3063{
3064 struct rbd_device *rbd_dev = (struct rbd_device *)data;
Alex Eldere627db02013-05-06 07:40:30 -05003065 int ret;
Alex Elderb8d70032012-11-30 17:53:04 -06003066
3067 if (!rbd_dev)
3068 return;
3069
Alex Elder37206ee2013-02-20 17:32:08 -06003070 dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003071 rbd_dev->header_name, (unsigned long long)notify_id,
3072 (unsigned int)opcode);
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04003073
3074 /*
3075 * Until adequate refresh error handling is in place, there is
3076 * not much we can do here, except warn.
3077 *
3078 * See http://tracker.ceph.com/issues/5040
3079 */
Alex Eldere627db02013-05-06 07:40:30 -05003080 ret = rbd_dev_refresh(rbd_dev);
3081 if (ret)
Ilya Dryomov9584d502014-07-11 12:11:20 +04003082 rbd_warn(rbd_dev, "refresh failed: %d", ret);
Alex Elderb8d70032012-11-30 17:53:04 -06003083
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04003084 ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id);
3085 if (ret)
Ilya Dryomov9584d502014-07-11 12:11:20 +04003086 rbd_warn(rbd_dev, "notify_ack ret %d", ret);
Alex Elderb8d70032012-11-30 17:53:04 -06003087}
3088
/*
 * Send a (un)watch request and wait for the ack.  Return a request
 * with a ref held on success or error.
 *
 * @watch selects between registering (true) and unregistering (false)
 * the watch on the header object.  When registering, the underlying
 * osd request is set to linger so it persists until torn down.
 */
static struct rbd_obj_request *rbd_obj_watch_request_helper(
						struct rbd_device *rbd_dev,
						bool watch)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;
	int ret;

	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
					     OBJ_REQUEST_NODATA);
	if (!obj_request)
		return ERR_PTR(-ENOMEM);

	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, 1,
						  obj_request);
	if (!obj_request->osd_req) {
		ret = -ENOMEM;
		goto out;
	}

	/* Uses the event cookie created by rbd_dev_header_watch_sync(). */
	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
			      rbd_dev->watch_event->cookie, 0, watch);
	rbd_osd_req_format_write(obj_request);

	if (watch)
		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);

	ret = rbd_obj_request_submit(osdc, obj_request);
	if (ret)
		goto out;

	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out;

	ret = obj_request->result;
	if (ret) {
		/* The osd rejected the watch; undo the lingering request. */
		if (watch)
			rbd_obj_request_end(obj_request);
		goto out;
	}

	return obj_request;

out:
	rbd_obj_request_put(obj_request);
	return ERR_PTR(ret);
}
3141
/*
 * Initiate a watch request, synchronously.
 *
 * Creates the osd event (rbd_watch_cb is its callback) and registers
 * a lingering watch on the header object.  On success the watch
 * request is stashed in rbd_dev->watch_request with a reference held;
 * rbd_dev_header_unwatch_sync() undoes all of this.
 */
static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;
	int ret;

	rbd_assert(!rbd_dev->watch_event);
	rbd_assert(!rbd_dev->watch_request);

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
				     &rbd_dev->watch_event);
	if (ret < 0)
		return ret;

	obj_request = rbd_obj_watch_request_helper(rbd_dev, true);
	if (IS_ERR(obj_request)) {
		/* Watch registration failed; drop the event too. */
		ceph_osdc_cancel_event(rbd_dev->watch_event);
		rbd_dev->watch_event = NULL;
		return PTR_ERR(obj_request);
	}

	/*
	 * A watch request is set to linger, so the underlying osd
	 * request won't go away until we unregister it.  We retain
	 * a pointer to the object request during that time (in
	 * rbd_dev->watch_request), so we'll keep a reference to it.
	 * We'll drop that reference after we've unregistered it in
	 * rbd_dev_header_unwatch_sync().
	 */
	rbd_dev->watch_request = obj_request;

	return 0;
}
3178
/*
 * Tear down a watch request, synchronously.
 *
 * Ends and releases the lingering watch request installed by
 * rbd_dev_header_watch_sync(), sends an unwatch to the osd (failure
 * is only warned about), then cancels the osd event.
 */
static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
{
	struct rbd_obj_request *obj_request;

	rbd_assert(rbd_dev->watch_event);
	rbd_assert(rbd_dev->watch_request);

	/* Drop the lingering watch request and our reference to it. */
	rbd_obj_request_end(rbd_dev->watch_request);
	rbd_obj_request_put(rbd_dev->watch_request);
	rbd_dev->watch_request = NULL;

	obj_request = rbd_obj_watch_request_helper(rbd_dev, false);
	if (!IS_ERR(obj_request))
		rbd_obj_request_put(obj_request);
	else
		rbd_warn(rbd_dev, "unable to tear down watch request (%ld)",
			 PTR_ERR(obj_request));

	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
}
3203
/*
 * Synchronous osd object method call.  Returns the number of bytes
 * returned in the outbound buffer, or a negative error code.
 *
 * @outbound/@outbound_size: optional parameters for the method call.
 * @inbound/@inbound_size: buffer receiving the method's reply.
 */
static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const void *outbound,
			     size_t outbound_size,
			     void *inbound,
			     size_t inbound_size)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;
	struct page **pages;
	u32 page_count;
	int ret;

	/*
	 * Method calls are ultimately read operations.  The result
	 * should placed into the inbound buffer provided.  They
	 * also supply outbound data--parameters for the object
	 * method.  Currently if this is present it will be a
	 * snapshot id.
	 */
	page_count = (u32)calc_pages_for(0, inbound_size);
	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	ret = -ENOMEM;
	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
							OBJ_REQUEST_PAGES);
	if (!obj_request)
		goto out;

	/* The object request now owns the page vector. */
	obj_request->pages = pages;
	obj_request->page_count = page_count;

	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
						  obj_request);
	if (!obj_request->osd_req)
		goto out;

	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
					class_name, method_name);
	if (outbound_size) {
		struct ceph_pagelist *pagelist;

		/* Outbound method parameters ride in a pagelist. */
		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
		if (!pagelist)
			goto out;

		ceph_pagelist_init(pagelist);
		ceph_pagelist_append(pagelist, outbound, outbound_size);
		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
						pagelist);
	}
	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
					obj_request->pages, inbound_size,
					0, false, false);
	rbd_osd_req_format_read(obj_request);

	ret = rbd_obj_request_submit(osdc, obj_request);
	if (ret)
		goto out;
	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out;

	ret = obj_request->result;
	if (ret < 0)
		goto out;

	/* Success: copy the reply to the caller, return its length. */
	rbd_assert(obj_request->xferred < (u64)INT_MAX);
	ret = (int)obj_request->xferred;
	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
out:
	if (obj_request)
		rbd_obj_request_put(obj_request);
	else
		ceph_release_page_vector(pages, page_count);

	return ret;
}
3290
/*
 * Process one block-layer request: classify it (read/write/discard),
 * validate it against the current mapping, build an image request and
 * submit it to the osd client.  On any failure the request is
 * completed immediately via blk_end_request_all().
 *
 * Called from rbd_request_workfn() (workqueue context), so it may
 * block (e.g. on header_rwsem).
 */
static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
{
	struct rbd_img_request *img_request;
	struct ceph_snap_context *snapc = NULL;
	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
	u64 length = blk_rq_bytes(rq);
	enum obj_operation_type op_type;
	u64 mapping_size;
	int result;

	if (rq->cmd_flags & REQ_DISCARD)
		op_type = OBJ_OP_DISCARD;
	else if (rq->cmd_flags & REQ_WRITE)
		op_type = OBJ_OP_WRITE;
	else
		op_type = OBJ_OP_READ;

	/* Ignore/skip any zero-length requests */

	if (!length) {
		dout("%s: zero-length request\n", __func__);
		result = 0;
		goto err_rq;
	}

	/* Only reads are allowed to a read-only device */

	if (op_type != OBJ_OP_READ) {
		if (rbd_dev->mapping.read_only) {
			result = -EROFS;
			goto err_rq;
		}
		rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
	}

	/*
	 * Quit early if the mapped snapshot no longer exists.  It's
	 * still possible the snapshot will have disappeared by the
	 * time our request arrives at the osd, but there's no sense in
	 * sending it if we already know.
	 */
	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
		dout("request for non-existent snapshot");
		rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
		result = -ENXIO;
		goto err_rq;
	}

	/* Reject a range whose end would wrap around u64. */
	if (offset && length > U64_MAX - offset + 1) {
		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
			 length);
		result = -EINVAL;
		goto err_rq;	/* Shouldn't happen */
	}

	/*
	 * Snapshot the mapping size and (for writes/discards) the snap
	 * context under header_rwsem so they are mutually consistent.
	 */
	down_read(&rbd_dev->header_rwsem);
	mapping_size = rbd_dev->mapping.size;
	if (op_type != OBJ_OP_READ) {
		snapc = rbd_dev->header.snapc;
		ceph_get_snap_context(snapc);
	}
	up_read(&rbd_dev->header_rwsem);

	if (offset + length > mapping_size) {
		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
			 length, mapping_size);
		result = -EIO;
		goto err_rq;
	}

	img_request = rbd_img_request_create(rbd_dev, offset, length, op_type,
					     snapc);
	if (!img_request) {
		result = -ENOMEM;
		goto err_rq;
	}
	img_request->rq = rq;
	/* img_request now owns the snapc reference taken above. */

	if (op_type == OBJ_OP_DISCARD)
		result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA,
					      NULL);
	else
		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
					      rq->bio);
	if (result)
		goto err_img_request;

	result = rbd_img_request_submit(img_request);
	if (result)
		goto err_img_request;

	return;

err_img_request:
	rbd_img_request_put(img_request);
err_rq:
	if (result)
		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
			 obj_op_name(op_type), length, offset, result);
	if (snapc)
		ceph_put_snap_context(snapc);
	blk_end_request_all(rq, result);
}
3394
3395static void rbd_request_workfn(struct work_struct *work)
3396{
3397 struct rbd_device *rbd_dev =
3398 container_of(work, struct rbd_device, rq_work);
3399 struct request *rq, *next;
3400 LIST_HEAD(requests);
3401
3402 spin_lock_irq(&rbd_dev->lock); /* rq->q->queue_lock */
3403 list_splice_init(&rbd_dev->rq_queue, &requests);
3404 spin_unlock_irq(&rbd_dev->lock);
3405
3406 list_for_each_entry_safe(rq, next, &requests, queuelist) {
3407 list_del_init(&rq->queuelist);
3408 rbd_handle_request(rbd_dev, rq);
3409 }
3410}
3411
3412/*
3413 * Called with q->queue_lock held and interrupts disabled, possibly on
3414 * the way to schedule(). Do not sleep here!
3415 */
Alex Elderbf0d5f502012-11-22 00:00:08 -06003416static void rbd_request_fn(struct request_queue *q)
3417{
3418 struct rbd_device *rbd_dev = q->queuedata;
Alex Elderbf0d5f502012-11-22 00:00:08 -06003419 struct request *rq;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003420 int queued = 0;
3421
3422 rbd_assert(rbd_dev);
Alex Elderbf0d5f502012-11-22 00:00:08 -06003423
3424 while ((rq = blk_fetch_request(q))) {
Alex Elderbf0d5f502012-11-22 00:00:08 -06003425 /* Ignore any non-FS requests that filter through. */
Alex Elderbf0d5f502012-11-22 00:00:08 -06003426 if (rq->cmd_type != REQ_TYPE_FS) {
Alex Elder4dda41d2013-02-20 21:59:33 -06003427 dout("%s: non-fs request type %d\n", __func__,
3428 (int) rq->cmd_type);
3429 __blk_end_request_all(rq, 0);
3430 continue;
3431 }
3432
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003433 list_add_tail(&rq->queuelist, &rbd_dev->rq_queue);
3434 queued++;
Alex Elderbf0d5f502012-11-22 00:00:08 -06003435 }
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003436
3437 if (queued)
3438 queue_work(rbd_dev->rq_wq, &rbd_dev->rq_work);
Alex Elderbf0d5f502012-11-22 00:00:08 -06003439}
3440
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003441/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003442 * a queue callback. Makes sure that we don't create a bio that spans across
3443 * multiple osd objects. One exception would be with a single page bios,
Alex Elderf7760da2012-10-20 22:17:27 -05003444 * which we handle later at bio_chain_clone_range()
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003445 */
3446static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
3447 struct bio_vec *bvec)
3448{
3449 struct rbd_device *rbd_dev = q->queuedata;
Alex Eldere5cfeed22012-10-20 22:17:27 -05003450 sector_t sector_offset;
3451 sector_t sectors_per_obj;
3452 sector_t obj_sector_offset;
3453 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003454
Alex Eldere5cfeed22012-10-20 22:17:27 -05003455 /*
3456 * Find how far into its rbd object the partition-relative
3457 * bio start sector is to offset relative to the enclosing
3458 * device.
3459 */
3460 sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
3461 sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
3462 obj_sector_offset = sector_offset & (sectors_per_obj - 1);
Alex Elder593a9e72012-02-07 12:03:37 -06003463
Alex Eldere5cfeed22012-10-20 22:17:27 -05003464 /*
3465 * Compute the number of bytes from that offset to the end
3466 * of the object. Account for what's already used by the bio.
3467 */
3468 ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
3469 if (ret > bmd->bi_size)
3470 ret -= bmd->bi_size;
3471 else
3472 ret = 0;
3473
3474 /*
3475 * Don't send back more than was asked for. And if the bio
3476 * was empty, let the whole thing through because: "Note
3477 * that a block device *must* allow a single page to be
3478 * added to an empty bio."
3479 */
3480 rbd_assert(bvec->bv_len <= PAGE_SIZE);
3481 if (ret > (int) bvec->bv_len || !bmd->bi_size)
3482 ret = (int) bvec->bv_len;
3483
3484 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003485}
3486
3487static void rbd_free_disk(struct rbd_device *rbd_dev)
3488{
3489 struct gendisk *disk = rbd_dev->disk;
3490
3491 if (!disk)
3492 return;
3493
Alex Eldera0cab922013-04-25 23:15:08 -05003494 rbd_dev->disk = NULL;
3495 if (disk->flags & GENHD_FL_UP) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003496 del_gendisk(disk);
Alex Eldera0cab922013-04-25 23:15:08 -05003497 if (disk->queue)
3498 blk_cleanup_queue(disk->queue);
3499 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003500 put_disk(disk);
3501}
3502
Alex Elder788e2df2013-01-17 12:25:27 -06003503static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
3504 const char *object_name,
Alex Elder7097f8d2013-04-30 00:44:33 -05003505 u64 offset, u64 length, void *buf)
Alex Elder788e2df2013-01-17 12:25:27 -06003506
3507{
Alex Elder21692382013-04-05 01:27:12 -05003508 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder788e2df2013-01-17 12:25:27 -06003509 struct rbd_obj_request *obj_request;
Alex Elder788e2df2013-01-17 12:25:27 -06003510 struct page **pages = NULL;
3511 u32 page_count;
Alex Elder1ceae7e2013-02-06 13:11:38 -06003512 size_t size;
Alex Elder788e2df2013-01-17 12:25:27 -06003513 int ret;
3514
3515 page_count = (u32) calc_pages_for(offset, length);
3516 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
3517 if (IS_ERR(pages))
3518 ret = PTR_ERR(pages);
3519
3520 ret = -ENOMEM;
3521 obj_request = rbd_obj_request_create(object_name, offset, length,
Alex Elder36be9a72013-01-19 00:30:28 -06003522 OBJ_REQUEST_PAGES);
Alex Elder788e2df2013-01-17 12:25:27 -06003523 if (!obj_request)
3524 goto out;
3525
3526 obj_request->pages = pages;
3527 obj_request->page_count = page_count;
3528
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08003529 obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
Ilya Dryomovdeb236b2014-02-25 16:22:27 +02003530 obj_request);
Alex Elder788e2df2013-01-17 12:25:27 -06003531 if (!obj_request->osd_req)
3532 goto out;
3533
Alex Elderc99d2d42013-04-05 01:27:11 -05003534 osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
3535 offset, length, 0, 0);
Alex Elder406e2c92013-04-15 14:50:36 -05003536 osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
Alex Eldera4ce40a2013-04-05 01:27:12 -05003537 obj_request->pages,
Alex Elder44cd1882013-04-05 01:27:12 -05003538 obj_request->length,
3539 obj_request->offset & ~PAGE_MASK,
3540 false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05003541 rbd_osd_req_format_read(obj_request);
Alex Elder430c28c2013-04-03 21:32:51 -05003542
Alex Elder788e2df2013-01-17 12:25:27 -06003543 ret = rbd_obj_request_submit(osdc, obj_request);
3544 if (ret)
3545 goto out;
3546 ret = rbd_obj_request_wait(obj_request);
3547 if (ret)
3548 goto out;
3549
3550 ret = obj_request->result;
3551 if (ret < 0)
3552 goto out;
Alex Elder1ceae7e2013-02-06 13:11:38 -06003553
3554 rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
3555 size = (size_t) obj_request->xferred;
Alex Elder903bb322013-02-06 13:11:38 -06003556 ceph_copy_from_page_vector(pages, buf, 0, size);
Alex Elder7097f8d2013-04-30 00:44:33 -05003557 rbd_assert(size <= (size_t)INT_MAX);
3558 ret = (int)size;
Alex Elder788e2df2013-01-17 12:25:27 -06003559out:
3560 if (obj_request)
3561 rbd_obj_request_put(obj_request);
3562 else
3563 ceph_release_page_vector(pages, page_count);
3564
3565 return ret;
3566}
3567
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003568/*
Alex Elder662518b2013-05-06 09:51:29 -05003569 * Read the complete header for the given rbd device. On successful
3570 * return, the rbd_dev->header field will contain up-to-date
3571 * information about the image.
Alex Elder4156d992012-08-02 11:29:46 -05003572 */
static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		/* Free the previous (too small) buffer; no-op first time */
		kfree(ondisk);

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return -ENOMEM;

		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
				       0, size, ondisk);
		if (ret < 0)
			goto out;
		/* A short read means we did not get the whole header */
		if ((size_t)ret < size) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
				size, ret);
			goto out;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "invalid header");
			goto out;
		}

		/*
		 * Record the sizes the on-disk header now reports; if
		 * the snapshot count changed since our last read the
		 * loop repeats with a correspondingly larger buffer.
		 */
		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	ret = rbd_header_from_disk(rbd_dev, ondisk);
out:
	kfree(ondisk);

	return ret;
}
3627
Alex Elder15228ed2013-05-01 12:43:03 -05003628/*
3629 * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
3630 * has disappeared from the (just updated) snapshot context.
3631 */
3632static void rbd_exists_validate(struct rbd_device *rbd_dev)
3633{
3634 u64 snap_id;
3635
3636 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
3637 return;
3638
3639 snap_id = rbd_dev->spec->snap_id;
3640 if (snap_id == CEPH_NOSNAP)
3641 return;
3642
3643 if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
3644 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
3645}
3646
/* Propagate a changed mapping size to the gendisk, if it still exists. */
static void rbd_dev_update_size(struct rbd_device *rbd_dev)
{
	sector_t size;
	bool removing;

	/*
	 * Don't hold the lock while doing disk operations,
	 * or lock ordering will conflict with the bdev mutex via:
	 * rbd_add() -> blkdev_get() -> rbd_open()
	 */
	spin_lock_irq(&rbd_dev->lock);
	removing = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
	spin_unlock_irq(&rbd_dev->lock);
	/*
	 * If the device is being removed, rbd_dev->disk has
	 * been destroyed, so don't try to update its size
	 */
	if (!removing) {
		/* mapping.size is in bytes; block layer wants 512-byte sectors */
		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
		dout("setting size to %llu sectors", (unsigned long long)size);
		set_capacity(rbd_dev->disk, size);
		revalidate_disk(rbd_dev->disk);
	}
}
3671
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003672static int rbd_dev_refresh(struct rbd_device *rbd_dev)
Alex Elder1fe5e992012-07-25 09:32:41 -05003673{
Alex Eldere627db02013-05-06 07:40:30 -05003674 u64 mapping_size;
Alex Elder1fe5e992012-07-25 09:32:41 -05003675 int ret;
3676
Alex Eldercfbf6372013-05-31 17:40:45 -05003677 down_write(&rbd_dev->header_rwsem);
Alex Elder3b5cf2a2013-05-29 11:18:59 -05003678 mapping_size = rbd_dev->mapping.size;
Ilya Dryomova720ae02014-07-23 17:11:19 +04003679
3680 ret = rbd_dev_header_info(rbd_dev);
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04003681 if (ret)
3682 return ret;
Alex Elder15228ed2013-05-01 12:43:03 -05003683
Ilya Dryomove8f59b52014-07-24 10:42:13 +04003684 /*
3685 * If there is a parent, see if it has disappeared due to the
3686 * mapped image getting flattened.
3687 */
3688 if (rbd_dev->parent) {
3689 ret = rbd_dev_v2_parent_info(rbd_dev);
3690 if (ret)
3691 return ret;
3692 }
3693
Ilya Dryomov5ff11082014-07-23 17:11:21 +04003694 if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
3695 if (rbd_dev->mapping.size != rbd_dev->header.image_size)
3696 rbd_dev->mapping.size = rbd_dev->header.image_size;
3697 } else {
3698 /* validate mapped snapshot's EXISTS flag */
3699 rbd_exists_validate(rbd_dev);
3700 }
Alex Elder15228ed2013-05-01 12:43:03 -05003701
Alex Eldercfbf6372013-05-31 17:40:45 -05003702 up_write(&rbd_dev->header_rwsem);
3703
Ilya Dryomove8f59b52014-07-24 10:42:13 +04003704 if (mapping_size != rbd_dev->mapping.size)
Josh Durgin98752012013-08-29 17:26:31 -07003705 rbd_dev_update_size(rbd_dev);
Alex Elder1fe5e992012-07-25 09:32:41 -05003706
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04003707 return 0;
Alex Elder1fe5e992012-07-25 09:32:41 -05003708}
3709
/*
 * Allocate and set up the gendisk and request queue for a mapped
 * image.  On success rbd_dev->disk is set and 0 is returned; on
 * failure -ENOMEM is returned and nothing is left allocated.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(single_major ?
			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
			  RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = rbd_dev->minor;
	if (single_major)
		disk->flags |= GENHD_FL_EXT_DEVT;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* enable the discard support */
	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
	q->limits.discard_granularity = segment_size;
	q->limits.discard_alignment = segment_size;

	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
3764
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003765/*
3766 sysfs
3767*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003768
/* Map an embedded struct device back to its containing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
3773
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003774static ssize_t rbd_size_show(struct device *dev,
3775 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003776{
Alex Elder593a9e72012-02-07 12:03:37 -06003777 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003778
Alex Elderfc71d832013-04-26 15:44:36 -05003779 return sprintf(buf, "%llu\n",
3780 (unsigned long long)rbd_dev->mapping.size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003781}
3782
Alex Elder34b13182012-07-13 20:35:12 -05003783/*
3784 * Note this shows the features for whatever's mapped, which is not
3785 * necessarily the base image.
3786 */
3787static ssize_t rbd_features_show(struct device *dev,
3788 struct device_attribute *attr, char *buf)
3789{
3790 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3791
3792 return sprintf(buf, "0x%016llx\n",
Alex Elderfc71d832013-04-26 15:44:36 -05003793 (unsigned long long)rbd_dev->mapping.features);
Alex Elder34b13182012-07-13 20:35:12 -05003794}
3795
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003796static ssize_t rbd_major_show(struct device *dev,
3797 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003798{
Alex Elder593a9e72012-02-07 12:03:37 -06003799 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003800
Alex Elderfc71d832013-04-26 15:44:36 -05003801 if (rbd_dev->major)
3802 return sprintf(buf, "%d\n", rbd_dev->major);
3803
3804 return sprintf(buf, "(none)\n");
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02003805}
Alex Elderfc71d832013-04-26 15:44:36 -05003806
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02003807static ssize_t rbd_minor_show(struct device *dev,
3808 struct device_attribute *attr, char *buf)
3809{
3810 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3811
3812 return sprintf(buf, "%d\n", rbd_dev->minor);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003813}
3814
3815static ssize_t rbd_client_id_show(struct device *dev,
3816 struct device_attribute *attr, char *buf)
3817{
Alex Elder593a9e72012-02-07 12:03:37 -06003818 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003819
Alex Elder1dbb4392012-01-24 10:08:37 -06003820 return sprintf(buf, "client%lld\n",
3821 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003822}
3823
3824static ssize_t rbd_pool_show(struct device *dev,
3825 struct device_attribute *attr, char *buf)
3826{
Alex Elder593a9e72012-02-07 12:03:37 -06003827 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003828
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003829 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003830}
3831
Alex Elder9bb2f332012-07-12 10:46:35 -05003832static ssize_t rbd_pool_id_show(struct device *dev,
3833 struct device_attribute *attr, char *buf)
3834{
3835 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3836
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003837 return sprintf(buf, "%llu\n",
Alex Elderfc71d832013-04-26 15:44:36 -05003838 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05003839}
3840
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003841static ssize_t rbd_name_show(struct device *dev,
3842 struct device_attribute *attr, char *buf)
3843{
Alex Elder593a9e72012-02-07 12:03:37 -06003844 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003845
Alex Eldera92ffdf2012-10-30 19:40:33 -05003846 if (rbd_dev->spec->image_name)
3847 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3848
3849 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003850}
3851
Alex Elder589d30e2012-07-10 20:30:11 -05003852static ssize_t rbd_image_id_show(struct device *dev,
3853 struct device_attribute *attr, char *buf)
3854{
3855 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3856
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003857 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05003858}
3859
Alex Elder34b13182012-07-13 20:35:12 -05003860/*
3861 * Shows the name of the currently-mapped snapshot (or
3862 * RBD_SNAP_HEAD_NAME for the base image).
3863 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003864static ssize_t rbd_snap_show(struct device *dev,
3865 struct device_attribute *attr,
3866 char *buf)
3867{
Alex Elder593a9e72012-02-07 12:03:37 -06003868 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003869
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003870 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003871}
3872
Alex Elder86b00e02012-10-25 23:34:42 -05003873/*
Ilya Dryomovff961282014-07-22 21:53:07 +04003874 * For a v2 image, shows the chain of parent images, separated by empty
3875 * lines. For v1 images or if there is no parent, shows "(no parent
3876 * image)".
Alex Elder86b00e02012-10-25 23:34:42 -05003877 */
3878static ssize_t rbd_parent_show(struct device *dev,
Ilya Dryomovff961282014-07-22 21:53:07 +04003879 struct device_attribute *attr,
3880 char *buf)
Alex Elder86b00e02012-10-25 23:34:42 -05003881{
3882 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Ilya Dryomovff961282014-07-22 21:53:07 +04003883 ssize_t count = 0;
Alex Elder86b00e02012-10-25 23:34:42 -05003884
Ilya Dryomovff961282014-07-22 21:53:07 +04003885 if (!rbd_dev->parent)
Alex Elder86b00e02012-10-25 23:34:42 -05003886 return sprintf(buf, "(no parent image)\n");
3887
Ilya Dryomovff961282014-07-22 21:53:07 +04003888 for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
3889 struct rbd_spec *spec = rbd_dev->parent_spec;
Alex Elder86b00e02012-10-25 23:34:42 -05003890
Ilya Dryomovff961282014-07-22 21:53:07 +04003891 count += sprintf(&buf[count], "%s"
3892 "pool_id %llu\npool_name %s\n"
3893 "image_id %s\nimage_name %s\n"
3894 "snap_id %llu\nsnap_name %s\n"
3895 "overlap %llu\n",
3896 !count ? "" : "\n", /* first? */
3897 spec->pool_id, spec->pool_name,
3898 spec->image_id, spec->image_name ?: "(unknown)",
3899 spec->snap_id, spec->snap_name,
3900 rbd_dev->parent_overlap);
3901 }
Alex Elder86b00e02012-10-25 23:34:42 -05003902
Ilya Dryomovff961282014-07-22 21:53:07 +04003903 return count;
Alex Elder86b00e02012-10-25 23:34:42 -05003904}
3905
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003906static ssize_t rbd_image_refresh(struct device *dev,
3907 struct device_attribute *attr,
3908 const char *buf,
3909 size_t size)
3910{
Alex Elder593a9e72012-02-07 12:03:37 -06003911 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05003912 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003913
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003914 ret = rbd_dev_refresh(rbd_dev);
Alex Eldere627db02013-05-06 07:40:30 -05003915 if (ret)
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04003916 return ret;
Alex Elderb8136232012-07-25 09:32:41 -05003917
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04003918 return size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003919}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003920
/* Per-device sysfs attributes (all read-only except "refresh"). */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_minor.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* Device lifetime is managed elsewhere; nothing to free at release. */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
3968
Alex Elder8b8fb992012-10-26 17:25:24 -05003969static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
3970{
3971 kref_get(&spec->kref);
3972
3973 return spec;
3974}
3975
3976static void rbd_spec_free(struct kref *kref);
3977static void rbd_spec_put(struct rbd_spec *spec)
3978{
3979 if (spec)
3980 kref_put(&spec->kref, rbd_spec_free);
3981}
3982
3983static struct rbd_spec *rbd_spec_alloc(void)
3984{
3985 struct rbd_spec *spec;
3986
3987 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
3988 if (!spec)
3989 return NULL;
Ilya Dryomov04077592014-07-23 17:11:20 +04003990
3991 spec->pool_id = CEPH_NOPOOL;
3992 spec->snap_id = CEPH_NOSNAP;
Alex Elder8b8fb992012-10-26 17:25:24 -05003993 kref_init(&spec->kref);
3994
Alex Elder8b8fb992012-10-26 17:25:24 -05003995 return spec;
3996}
3997
3998static void rbd_spec_free(struct kref *kref)
3999{
4000 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
4001
4002 kfree(spec->pool_name);
4003 kfree(spec->image_id);
4004 kfree(spec->image_name);
4005 kfree(spec->snap_name);
4006 kfree(spec);
4007}
4008
Alex Eldercc344fa2013-02-19 12:25:56 -06004009static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
Alex Elderc53d5892012-10-25 23:34:42 -05004010 struct rbd_spec *spec)
4011{
4012 struct rbd_device *rbd_dev;
4013
4014 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
4015 if (!rbd_dev)
4016 return NULL;
4017
4018 spin_lock_init(&rbd_dev->lock);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004019 INIT_LIST_HEAD(&rbd_dev->rq_queue);
4020 INIT_WORK(&rbd_dev->rq_work, rbd_request_workfn);
Alex Elder6d292902013-01-14 12:43:31 -06004021 rbd_dev->flags = 0;
Alex Eldera2acd002013-05-08 22:50:04 -05004022 atomic_set(&rbd_dev->parent_ref, 0);
Alex Elderc53d5892012-10-25 23:34:42 -05004023 INIT_LIST_HEAD(&rbd_dev->node);
Alex Elderc53d5892012-10-25 23:34:42 -05004024 init_rwsem(&rbd_dev->header_rwsem);
4025
4026 rbd_dev->spec = spec;
4027 rbd_dev->rbd_client = rbdc;
4028
Alex Elder0903e872012-11-14 12:25:19 -06004029 /* Initialize the layout used for all rbd requests */
4030
4031 rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
4032 rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
4033 rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
4034 rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
4035
Alex Elderc53d5892012-10-25 23:34:42 -05004036 return rbd_dev;
4037}
4038
/* Release the client and spec the device holds, then free the device. */
static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev);
}
4045
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	/* Wire format of the "get_size" class method reply */
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				&snapid, sizeof (snapid),
				&size_buf, sizeof (size_buf));
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	/* A reply shorter than the expected struct is malformed */
	if (ret < sizeof (size_buf))
		return -ERANGE;

	/* order is optional; pass NULL when only the size is wanted */
	if (order) {
		*order = size_buf.order;
		dout(" order %u", (unsigned int)*order);
	}
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx snap_size = %llu\n",
		(unsigned long long)snap_id,
		(unsigned long long)*snap_size);

	return 0;
}
4083
/* Fetch size and object order of the base image into the header. */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
4090
/*
 * Fetch the object name prefix for a v2 image and store it in
 * rbd_dev->header.object_prefix (kmalloc'd string).
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix", NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	/* Reply is a length-prefixed string; extract a fresh copy */
	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + ret, NULL, GFP_NOIO);
	ret = 0;

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}
out:
	kfree(reply_buf);

	return ret;
}
4124
/*
 * Fetch the feature bits for the given snapshot (or the base image
 * for CEPH_NOSNAP).  Fails with -ENXIO if the image uses incompatible
 * features this driver does not support.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	/* Wire format of the "get_features" class method reply */
	struct {
		__le64 features;
		__le64 incompat;
	} __attribute__ ((packed)) features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				&snapid, sizeof (snapid),
				&features_buf, sizeof (features_buf));
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < sizeof (features_buf))
		return -ERANGE;

	/* Refuse to map an image requiring features we don't implement */
	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_SUPPORTED)
		return -ENXIO;

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long)snap_id,
		(unsigned long long)*snap_features,
		(unsigned long long)le64_to_cpu(features_buf.incompat));

	return 0;
}
4159
/* Fetch the base image's feature bits into the header. */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}
4165
/*
 * Query the parent (pool, image, snapshot, overlap) of a v2 image and
 * update rbd_dev accordingly.  Also detects a previously-seen parent
 * that has since disappeared (clone flattened, or overlap shrunk to 0)
 * and drops our reference to it.  Returns 0 on success.
 */
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	size_t size;
	void *reply_buf = NULL;
	__le64 snapid;
	void *p;
	void *end;
	u64 pool_id;
	char *image_id;
	u64 snap_id;
	u64 overlap;
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	size = sizeof (__le64) +			/* pool_id */
		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
		sizeof (__le64) +			/* snap_id */
		sizeof (__le64);			/* overlap */
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf) {
		ret = -ENOMEM;
		goto out_err;
	}

	snapid = cpu_to_le64(rbd_dev->spec->snap_id);
	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_parent",
				&snapid, sizeof (snapid),
				reply_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out_err;

	p = reply_buf;
	end = reply_buf + ret;
	ret = -ERANGE;	/* default error for the decode macros below */
	ceph_decode_64_safe(&p, end, pool_id, out_err);
	if (pool_id == CEPH_NOPOOL) {
		/*
		 * Either the parent never existed, or we have
		 * record of it but the image got flattened so it no
		 * longer has a parent.  When the parent of a
		 * layered image disappears we immediately set the
		 * overlap to 0.  The effect of this is that all new
		 * requests will be treated as if the image had no
		 * parent.
		 */
		if (rbd_dev->parent_overlap) {
			rbd_dev->parent_overlap = 0;
			smp_mb();
			rbd_dev_parent_put(rbd_dev);
			pr_info("%s: clone image has been flattened\n",
				rbd_dev->disk->disk_name);
		}

		goto out;	/* No parent?  No problem. */
	}

	/* The ceph file layout needs to fit pool id in 32 bits */

	ret = -EIO;
	if (pool_id > (u64)U32_MAX) {
		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
			(unsigned long long)pool_id, U32_MAX);
		goto out_err;
	}

	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(image_id)) {
		ret = PTR_ERR(image_id);
		goto out_err;
	}
	ceph_decode_64_safe(&p, end, snap_id, out_err);
	ceph_decode_64_safe(&p, end, overlap, out_err);

	/*
	 * The parent won't change (except when the clone is
	 * flattened, already handled that).  So we only need to
	 * record the parent spec we have not already done so.
	 */
	if (!rbd_dev->parent_spec) {
		parent_spec->pool_id = pool_id;
		parent_spec->image_id = image_id;
		parent_spec->snap_id = snap_id;
		rbd_dev->parent_spec = parent_spec;
		parent_spec = NULL;	/* rbd_dev now owns this */
	} else {
		/* Spec already recorded; drop the duplicate id string */
		kfree(image_id);
	}

	/*
	 * We always update the parent overlap.  If it's zero we
	 * treat it specially.
	 */
	rbd_dev->parent_overlap = overlap;
	smp_mb();
	if (!overlap) {

		/* A null parent_spec indicates it's the initial probe */

		if (parent_spec) {
			/*
			 * The overlap has become zero, so the clone
			 * must have been resized down to 0 at some
			 * point.  Treat this the same as a flatten.
			 */
			rbd_dev_parent_put(rbd_dev);
			pr_info("%s: clone image now standalone\n",
				rbd_dev->disk->disk_name);
		} else {
			/*
			 * For the initial probe, if we find the
			 * overlap is zero we just pretend there was
			 * no parent image.
			 */
			rbd_warn(rbd_dev, "ignoring parent with overlap 0");
		}
	}
out:
	ret = 0;
out_err:
	kfree(reply_buf);
	rbd_spec_put(parent_spec);

	return ret;
}
4296
/*
 * Fetch the image's stripe unit and count, accepting only the default
 * (non-fancy) striping layout.  Records the values in the header on
 * success; returns -EINVAL for unsupported layouts.
 */
static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
{
	/* Wire format of the "get_stripe_unit_count" class method reply */
	struct {
		__le64 stripe_unit;
		__le64 stripe_count;
	} __attribute__ ((packed)) striping_info_buf = { 0 };
	size_t size = sizeof (striping_info_buf);
	void *p;
	u64 obj_size;
	u64 stripe_unit;
	u64 stripe_count;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_stripe_unit_count", NULL, 0,
				(char *)&striping_info_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < size)
		return -ERANGE;

	/*
	 * We don't actually support the "fancy striping" feature
	 * (STRIPINGV2) yet, but if the striping sizes are the
	 * defaults the behavior is the same as before.  So find
	 * out, and only fail if the image has non-default values.
	 */
	ret = -EINVAL;
	obj_size = (u64)1 << rbd_dev->header.obj_order;
	p = &striping_info_buf;
	stripe_unit = ceph_decode_64(&p);
	if (stripe_unit != obj_size) {
		rbd_warn(rbd_dev, "unsupported stripe unit "
			"(got %llu want %llu)",
			stripe_unit, obj_size);
		return -EINVAL;
	}
	stripe_count = ceph_decode_64(&p);
	if (stripe_count != 1) {
		rbd_warn(rbd_dev, "unsupported stripe count "
			"(got %llu want 1)", stripe_count);
		return -EINVAL;
	}
	rbd_dev->header.stripe_unit = stripe_unit;
	rbd_dev->header.stripe_count = stripe_count;

	return 0;
}
4346
/*
 * Look up the image name for rbd_dev's image id in the rbd directory
 * object.  Returns a kmalloc'd name string the caller must free, or
 * NULL if the lookup failed (the name is then simply unknown).
 */
static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	/* Only called when the name has not been determined yet */
	rbd_assert(!rbd_dev->spec->image_name);

	/* Build the request: a length-prefixed copy of the image id */
	len = strlen(rbd_dev->spec->image_id);
	image_id_size = sizeof (__le32) + len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
				"rbd", "dir_get_name",
				image_id, image_id_size,
				reply_buf, size);
	if (ret < 0)
		goto out;
	p = reply_buf;
	end = reply_buf + ret;

	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;	/* lookup failed; treat as unknown */
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}
4396
Alex Elder2ad3d712013-04-30 00:44:33 -05004397static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4398{
4399 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4400 const char *snap_name;
4401 u32 which = 0;
4402
4403 /* Skip over names until we find the one we are looking for */
4404
4405 snap_name = rbd_dev->header.snap_names;
4406 while (which < snapc->num_snaps) {
4407 if (!strcmp(name, snap_name))
4408 return snapc->snaps[which];
4409 snap_name += strlen(snap_name) + 1;
4410 which++;
4411 }
4412 return CEPH_NOSNAP;
4413}
4414
4415static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4416{
4417 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4418 u32 which;
4419 bool found = false;
4420 u64 snap_id;
4421
4422 for (which = 0; !found && which < snapc->num_snaps; which++) {
4423 const char *snap_name;
4424
4425 snap_id = snapc->snaps[which];
4426 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
Josh Durginefadc982013-08-29 19:16:42 -07004427 if (IS_ERR(snap_name)) {
4428 /* ignore no-longer existing snapshots */
4429 if (PTR_ERR(snap_name) == -ENOENT)
4430 continue;
4431 else
4432 break;
4433 }
Alex Elder2ad3d712013-04-30 00:44:33 -05004434 found = !strcmp(name, snap_name);
4435 kfree(snap_name);
4436 }
4437 return found ? snap_id : CEPH_NOSNAP;
4438}
4439
4440/*
4441 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
4442 * no snapshot by that name is found, or if an error occurs.
4443 */
4444static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4445{
4446 if (rbd_dev->image_format == 1)
4447 return rbd_v1_snap_id_by_name(rbd_dev, name);
4448
4449 return rbd_v2_snap_id_by_name(rbd_dev, name);
4450}
4451
Alex Elder9e15b772012-10-30 19:40:33 -05004452/*
Ilya Dryomov04077592014-07-23 17:11:20 +04004453 * An image being mapped will have everything but the snap id.
Alex Elder9e15b772012-10-30 19:40:33 -05004454 */
Ilya Dryomov04077592014-07-23 17:11:20 +04004455static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
4456{
4457 struct rbd_spec *spec = rbd_dev->spec;
4458
4459 rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
4460 rbd_assert(spec->image_id && spec->image_name);
4461 rbd_assert(spec->snap_name);
4462
4463 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
4464 u64 snap_id;
4465
4466 snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
4467 if (snap_id == CEPH_NOSNAP)
4468 return -ENOENT;
4469
4470 spec->snap_id = snap_id;
4471 } else {
4472 spec->snap_id = CEPH_NOSNAP;
4473 }
4474
4475 return 0;
4476}
4477
/*
 * A parent image will have all ids but none of the names.
 *
 * All names in an rbd spec are dynamically allocated.  It's OK if we
 * can't figure out the name for an image id.
 *
 * On success ownership of all three name strings is transferred to
 * the spec; on failure everything allocated here is freed.
 */
static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_spec *spec = rbd_dev->spec;
	const char *pool_name;
	const char *image_name;
	const char *snap_name;
	int ret;

	rbd_assert(spec->pool_id != CEPH_NOPOOL);
	rbd_assert(spec->image_id);
	rbd_assert(spec->snap_id != CEPH_NOSNAP);

	/* Get the pool name; we have to make our own copy of this */

	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
	if (!pool_name) {
		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
		return -EIO;
	}
	pool_name = kstrdup(pool_name, GFP_KERNEL);
	if (!pool_name)
		return -ENOMEM;

	/* Fetch the image name; tolerate failure here */

	image_name = rbd_dev_image_name(rbd_dev);
	if (!image_name)
		rbd_warn(rbd_dev, "unable to get image name");

	/* Fetch the snapshot name */

	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out_err;
	}

	/* Success: the spec now owns all three strings */

	spec->pool_name = pool_name;
	spec->image_name = image_name;
	spec->snap_name = snap_name;

	return 0;

out_err:
	kfree(image_name);
	kfree(pool_name);
	return ret;
}
4533
/*
 * Fetch the image's snapshot context (seq and snapshot id array) via
 * the "get_snapcontext" method on the header object, and install it
 * in rbd_dev->header.snapc, dropping the previous context (if any).
 */
static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	void *reply_buf;
	void *p;
	void *end;
	u64 seq;
	u32 snap_count;
	struct ceph_snap_context *snapc;
	u32 i;

	/*
	 * We'll need room for the seq value (maximum snapshot id),
	 * snapshot count, and array of that many snapshot ids.
	 * For now we have a fixed upper limit on the number we're
	 * prepared to receive.
	 */
	size = sizeof (__le64) + sizeof (__le32) +
			RBD_MAX_SNAP_COUNT * sizeof (__le64);
	reply_buf = kzalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_snapcontext", NULL, 0,
				reply_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	/* ret is the number of bytes returned; decode seq and count */

	p = reply_buf;
	end = reply_buf + ret;
	ret = -ERANGE;		/* error reported by the _safe decoders */
	ceph_decode_64_safe(&p, end, seq, out);
	ceph_decode_32_safe(&p, end, snap_count, out);

	/*
	 * Make sure the reported number of snapshot ids wouldn't go
	 * beyond the end of our buffer.  But before checking that,
	 * make sure the computed size of the snapshot context we
	 * allocate is representable in a size_t.
	 */
	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
				/ sizeof (u64)) {
		ret = -EINVAL;
		goto out;
	}
	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
		goto out;
	ret = 0;

	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc) {
		ret = -ENOMEM;
		goto out;
	}
	snapc->seq = seq;
	for (i = 0; i < snap_count; i++)
		snapc->snaps[i] = ceph_decode_64(&p);

	/* Replace any previously-installed snapshot context */

	ceph_put_snap_context(rbd_dev->header.snapc);
	rbd_dev->header.snapc = snapc;

	dout("  snap context seq = %llu, snap_count = %u\n",
		(unsigned long long)seq, (unsigned int)snap_count);
out:
	kfree(reply_buf);

	return ret;
}
4605
/*
 * Fetch the name of the snapshot with the given id from a format 2
 * image's header object.  Returns a dynamically-allocated string
 * (caller must kfree()) or an ERR_PTR() on failure.
 */
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id)
{
	size_t size;
	void *reply_buf;
	__le64 snapid;
	int ret;
	void *p;
	void *end;
	char *snap_name;

	/* Reply is a length-prefixed string of bounded size */

	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	snapid = cpu_to_le64(snap_id);
	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_snapshot_name",
				&snapid, sizeof (snapid),
				reply_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0) {
		snap_name = ERR_PTR(ret);	/* propagated through out: */
		goto out;
	}

	p = reply_buf;
	end = reply_buf + ret;
	/* May itself return an ERR_PTR(), passed straight to the caller */
	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(snap_name))
		goto out;

	dout("  snap_id 0x%016llx snap_name = %s\n",
		(unsigned long long)snap_id, snap_name);
out:
	kfree(reply_buf);

	return snap_name;
}
4646
Alex Elder2df3fac2013-05-06 09:51:30 -05004647static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
Alex Elder117973f2012-08-31 17:29:55 -05004648{
Alex Elder2df3fac2013-05-06 09:51:30 -05004649 bool first_time = rbd_dev->header.object_prefix == NULL;
Alex Elder117973f2012-08-31 17:29:55 -05004650 int ret;
Alex Elder117973f2012-08-31 17:29:55 -05004651
Josh Durgin1617e402013-06-12 14:43:10 -07004652 ret = rbd_dev_v2_image_size(rbd_dev);
4653 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05004654 return ret;
Josh Durgin1617e402013-06-12 14:43:10 -07004655
Alex Elder2df3fac2013-05-06 09:51:30 -05004656 if (first_time) {
4657 ret = rbd_dev_v2_header_onetime(rbd_dev);
4658 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05004659 return ret;
Alex Elder2df3fac2013-05-06 09:51:30 -05004660 }
4661
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004662 ret = rbd_dev_v2_snap_context(rbd_dev);
Alex Elder117973f2012-08-31 17:29:55 -05004663 dout("rbd_dev_v2_snap_context returned %d\n", ret);
Alex Elder117973f2012-08-31 17:29:55 -05004664
4665 return ret;
4666}
4667
Ilya Dryomova720ae02014-07-23 17:11:19 +04004668static int rbd_dev_header_info(struct rbd_device *rbd_dev)
4669{
4670 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4671
4672 if (rbd_dev->image_format == 1)
4673 return rbd_dev_v1_header_info(rbd_dev);
4674
4675 return rbd_dev_v2_header_info(rbd_dev);
4676}
4677
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004678static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
4679{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004680 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05004681 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004682
Alex Eldercd789ab2012-08-30 00:16:38 -05004683 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004684 dev->bus = &rbd_bus_type;
4685 dev->type = &rbd_device_type;
4686 dev->parent = &rbd_root_dev;
Alex Elder200a6a82013-04-28 23:32:34 -05004687 dev->release = rbd_dev_device_release;
Alex Elderde71a292012-07-03 16:01:19 -05004688 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004689 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004690
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004691 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004692}
4693
/* Undo rbd_bus_add_dev(): unregister the embedded struct device */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
4698
Alex Elder1ddbe942012-01-29 13:57:44 -06004699/*
Alex Elder499afd52012-02-02 08:13:29 -06004700 * Get a unique rbd identifier for the given new rbd_dev, and add
Ilya Dryomovf8a22fc2013-12-13 15:28:57 +02004701 * the rbd_dev to the global list.
Alex Elder1ddbe942012-01-29 13:57:44 -06004702 */
Ilya Dryomovf8a22fc2013-12-13 15:28:57 +02004703static int rbd_dev_id_get(struct rbd_device *rbd_dev)
Alex Elderb7f23c32012-01-29 13:57:43 -06004704{
Ilya Dryomovf8a22fc2013-12-13 15:28:57 +02004705 int new_dev_id;
4706
Ilya Dryomov9b60e702013-12-13 15:28:57 +02004707 new_dev_id = ida_simple_get(&rbd_dev_id_ida,
4708 0, minor_to_rbd_dev_id(1 << MINORBITS),
4709 GFP_KERNEL);
Ilya Dryomovf8a22fc2013-12-13 15:28:57 +02004710 if (new_dev_id < 0)
4711 return new_dev_id;
4712
4713 rbd_dev->dev_id = new_dev_id;
Alex Elder499afd52012-02-02 08:13:29 -06004714
4715 spin_lock(&rbd_dev_list_lock);
4716 list_add_tail(&rbd_dev->node, &rbd_dev_list);
4717 spin_unlock(&rbd_dev_list_lock);
Ilya Dryomovf8a22fc2013-12-13 15:28:57 +02004718
Ilya Dryomov70eebd22013-12-13 15:28:56 +02004719 dout("rbd_dev %p given dev id %d\n", rbd_dev, rbd_dev->dev_id);
Ilya Dryomovf8a22fc2013-12-13 15:28:57 +02004720
4721 return 0;
Alex Elder1ddbe942012-01-29 13:57:44 -06004722}
Alex Elderb7f23c32012-01-29 13:57:43 -06004723
/*
 * Remove an rbd_dev from the global list, and record that its
 * identifier is no longer in use.
 */
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
{
	/* Unlink from the global device list under its lock */
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);
	spin_unlock(&rbd_dev_list_lock);

	/* Return the id allocated by rbd_dev_id_get() */
	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);

	dout("rbd_dev %p released dev id %d\n", rbd_dev, rbd_dev->dev_id);
}
4738
/*
 * Advance *buf past any leading white space and return the length of
 * the token (run of non-space characters) that follows.  On return
 * *buf points at the token's first character, or at the terminating
 * '\0' when the string contains only white space.  *buf must be
 * NUL-terminated on entry.
 */
static inline size_t next_token(const char **buf)
{
	/* Characters for which isspace() is nonzero in the "C" and
	 * "POSIX" locales. */
	static const char spaces[] = " \f\n\r\t\v";
	const char *p = *buf;

	p += strspn(p, spaces);		/* skip to start of token */
	*buf = p;

	return strcspn(p, spaces);	/* token length */
}
4757
/*
 * Find the next token in *buf and, when it fits (including its '\0'
 * terminator) in the token_size-byte buffer provided, copy it there
 * NUL-terminated.  *buf must be NUL-terminated on entry.
 *
 * Returns the token's length (excluding the '\0'): 0 when no token
 * was found, and >= token_size when the token did not fit.
 *
 * *buf is advanced past the token in every case, even when the token
 * was too large to copy.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;	/* consume the token regardless of the copy */

	return len;
}
4787
4788/*
Alex Elderea3352f2012-07-09 21:04:23 -05004789 * Finds the next token in *buf, dynamically allocates a buffer big
4790 * enough to hold a copy of it, and copies the token into the new
4791 * buffer. The copy is guaranteed to be terminated with '\0'. Note
4792 * that a duplicate buffer is created even for a zero-length token.
4793 *
4794 * Returns a pointer to the newly-allocated duplicate, or a null
4795 * pointer if memory for the duplicate was not available. If
4796 * the lenp argument is a non-null pointer, the length of the token
4797 * (not including the '\0') is returned in *lenp.
4798 *
4799 * If successful, the *buf pointer will be updated to point beyond
4800 * the end of the found token.
4801 *
4802 * Note: uses GFP_KERNEL for allocation.
4803 */
4804static inline char *dup_token(const char **buf, size_t *lenp)
4805{
4806 char *dup;
4807 size_t len;
4808
4809 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05004810 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05004811 if (!dup)
4812 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05004813 *(dup + len) = '\0';
4814 *buf += len;
4815
4816 if (lenp)
4817 *lenp = len;
4818
4819 return dup;
4820}
4821
/*
 * Parse the options provided for an "rbd add" (i.e., rbd image
 * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
 * and the data written is passed here via a NUL-terminated buffer.
 * Returns 0 if successful or an error code otherwise.
 *
 * The information extracted from these options is recorded in
 * the other parameters which return dynamically-allocated
 * structures:
 *  ceph_opts
 *      The address of a pointer that will refer to a ceph options
 *      structure.  Caller must release the returned pointer using
 *      ceph_destroy_options() when it is no longer needed.
 *  rbd_opts
 *      Address of an rbd options pointer.  Fully initialized by
 *      this function; caller must release with kfree().
 *  spec
 *      Address of an rbd image specification pointer.  Fully
 *      initialized by this function based on parsed options.
 *      Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_id>
 *      An optional snapshot id.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot id is
 *      provided.  Snapshot mappings are always read-only.
 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	char *snap_name;
	size_t mon_addrs_size;
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	/* mon_addrs is not duplicated; it points into the caller's buffer */
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	/* Paths that jump to out_err without setting ret report -EINVAL */
	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	spec->image_name = dup_token(&buf, NULL);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_mem;
	*(snap_name + len) = '\0';
	spec->snap_name = snap_name;

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	/* parse_rbd_opts_token() fills rbd_opts from rbd-specific options */
	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	/* Success: hand ownership of all three results to the caller */
	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}
4967
/*
 * Return pool id (>= 0) or a negative error code.
 *
 * If the pool isn't in our cached osdmap, ask the monitors for the
 * newest map epoch once and retry the lookup against it before
 * reporting -ENOENT.
 */
static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
{
	u64 newest_epoch;
	unsigned long timeout = rbdc->client->options->mount_timeout * HZ;
	int tries = 0;		/* only one osdmap refresh is attempted */
	int ret;

again:
	ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
	if (ret == -ENOENT && tries++ < 1) {
		ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap",
					       &newest_epoch);
		if (ret < 0)
			return ret;

		if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
			/* Wait result is ignored; worst case we just retry */
			ceph_monc_request_next_osdmap(&rbdc->client->monc);
			(void) ceph_monc_wait_osdmap(&rbdc->client->monc,
						     newest_epoch, timeout);
			goto again;
		} else {
			/* the osdmap we have is new enough */
			return -ENOENT;
		}
	}

	return ret;
}
4999
/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	char *image_id;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.  We
	 * do still need to set the image format though.
	 */
	if (rbd_dev->spec->image_id) {
		/* An empty id string marks a format 1 image */
		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;

		return 0;
	}

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* If it doesn't exist we'll assume it's a format 1 image */

	ret = rbd_obj_method_sync(rbd_dev, object_name,
				"rbd", "get_id", NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret == -ENOENT) {
		/* Format 1: record an empty id string */
		image_id = kstrdup("", GFP_KERNEL);
		ret = image_id ? 0 : -ENOMEM;
		if (!ret)
			rbd_dev->image_format = 1;
	} else if (ret >= 0) {
		void *p = response;

		image_id = ceph_extract_encoded_string(&p, p + ret,
						NULL, GFP_NOIO);
		ret = PTR_ERR_OR_ZERO(image_id);
		if (!ret)
			rbd_dev->image_format = 2;
	}
	/* Any other negative ret falls through with image_id unused */

	if (!ret) {
		rbd_dev->spec->image_id = image_id;
		dout("image_id is %s\n", image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
5085
/*
 * Undo whatever state changes are made by v1 or v2 header info
 * call.
 */
static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
{
	struct rbd_image_header *header;

	/* Drop parent reference unless it's already been done (or none) */

	if (rbd_dev->parent_overlap)
		rbd_dev_parent_put(rbd_dev);

	/* Free dynamic fields from the header, then zero it out */

	header = &rbd_dev->header;
	ceph_put_snap_context(header->snapc);	/* ref, not a kfree */
	kfree(header->snap_sizes);
	kfree(header->snap_names);
	kfree(header->object_prefix);
	memset(header, 0, sizeof (*header));
}
5108
/*
 * Fetch the one-time pieces of a format 2 image's header: the object
 * prefix, the feature bits, and (when the image uses fancy striping)
 * the striping parameters.  Returns 0 on success, negative errno on
 * failure; on failure any partially-set header state is undone.
 */
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
{
	int ret;

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret)
		goto out_err;

	/*
	 * Get and check the features for the image.  Currently the
	 * features are assumed to never change.
	 */
	ret = rbd_dev_v2_features(rbd_dev);
	if (ret)
		goto out_err;

	/* If the image supports fancy striping, get its parameters */

	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
		ret = rbd_dev_v2_striping_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}
	/* No support for crypto and compression type format 2 images */

	return 0;
out_err:
	/* Undo partial setup so a later probe starts from a clean slate */
	rbd_dev->header.features = 0;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
5142
Alex Elder124afba2013-04-26 15:44:36 -05005143static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
Alex Elder83a06262012-10-30 15:47:17 -05005144{
Alex Elder2f82ee52012-10-30 19:40:33 -05005145 struct rbd_device *parent = NULL;
Alex Elder124afba2013-04-26 15:44:36 -05005146 struct rbd_spec *parent_spec;
5147 struct rbd_client *rbdc;
5148 int ret;
5149
5150 if (!rbd_dev->parent_spec)
5151 return 0;
5152 /*
5153 * We need to pass a reference to the client and the parent
5154 * spec when creating the parent rbd_dev. Images related by
5155 * parent/child relationships always share both.
5156 */
5157 parent_spec = rbd_spec_get(rbd_dev->parent_spec);
5158 rbdc = __rbd_get_client(rbd_dev->rbd_client);
5159
5160 ret = -ENOMEM;
5161 parent = rbd_dev_create(rbdc, parent_spec);
5162 if (!parent)
5163 goto out_err;
5164
Alex Elder1f3ef782013-05-06 17:40:33 -05005165 ret = rbd_dev_image_probe(parent, false);
Alex Elder124afba2013-04-26 15:44:36 -05005166 if (ret < 0)
5167 goto out_err;
5168 rbd_dev->parent = parent;
Alex Eldera2acd002013-05-08 22:50:04 -05005169 atomic_set(&rbd_dev->parent_ref, 1);
Alex Elder124afba2013-04-26 15:44:36 -05005170
5171 return 0;
5172out_err:
5173 if (parent) {
Alex Elderfb65d2282013-05-08 22:50:04 -05005174 rbd_dev_unparent(rbd_dev);
Alex Elder124afba2013-04-26 15:44:36 -05005175 kfree(rbd_dev->header_name);
5176 rbd_dev_destroy(parent);
5177 } else {
5178 rbd_put_client(rbdc);
5179 rbd_spec_put(parent_spec);
5180 }
5181
5182 return ret;
5183}
5184
/*
 * Set up the Linux side of an rbd device: allocate a device id and
 * name, register (or derive) the block-device major/minor, create the
 * gendisk, size the mapping, create the request workqueue and add the
 * device to the rbd bus.  On success the disk is announced via
 * add_disk() and 0 is returned; on failure everything is unwound in
 * reverse order and a negative errno is returned.
 */
static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
{
	int ret;

	/* Get an id and fill in device name. */

	ret = rbd_dev_id_get(rbd_dev);
	if (ret)
		return ret;

	/* Name buffer must fit "rbd" plus the widest possible id */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Record our major and minor device numbers. */

	if (!single_major) {
		/* Dedicated major per device */
		ret = register_blkdev(0, rbd_dev->name);
		if (ret < 0)
			goto err_out_id;

		rbd_dev->major = ret;
		rbd_dev->minor = 0;
	} else {
		/* Shared major; minor derived from the device id */
		rbd_dev->major = rbd_major;
		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
	}

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_dev_mapping_set(rbd_dev);
	if (ret)
		goto err_out_disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
	set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only);

	rbd_dev->rq_wq = alloc_workqueue("%s", 0, 0, rbd_dev->disk->disk_name);
	if (!rbd_dev->rq_wq) {
		ret = -ENOMEM;
		goto err_out_mapping;
	}

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_workqueue;

	/* Everything's ready.  Announce the disk to the world. */

	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;

err_out_workqueue:
	destroy_workqueue(rbd_dev->rq_wq);
	rbd_dev->rq_wq = NULL;
err_out_mapping:
	rbd_dev_mapping_clear(rbd_dev);
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	if (!single_major)
		unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
	/*
	 * NOTE(review): paths reaching this label either never set up
	 * the mapping or already cleared it at err_out_mapping above,
	 * so this second rbd_dev_mapping_clear() looks redundant -
	 * confirm before removing.
	 */
	rbd_dev_mapping_clear(rbd_dev);

	return ret;
}
5262
Alex Elder332bb122013-04-27 09:59:30 -05005263static int rbd_dev_header_name(struct rbd_device *rbd_dev)
5264{
5265 struct rbd_spec *spec = rbd_dev->spec;
5266 size_t size;
5267
5268 /* Record the header object name for this rbd image. */
5269
5270 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5271
5272 if (rbd_dev->image_format == 1)
5273 size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
5274 else
5275 size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
5276
5277 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
5278 if (!rbd_dev->header_name)
5279 return -ENOMEM;
5280
5281 if (rbd_dev->image_format == 1)
5282 sprintf(rbd_dev->header_name, "%s%s",
5283 spec->image_name, RBD_SUFFIX);
5284 else
5285 sprintf(rbd_dev->header_name, "%s%s",
5286 RBD_HEADER_PREFIX, spec->image_id);
5287 return 0;
5288}
5289
/*
 * Release everything rbd_dev_image_probe() set up, then destroy the
 * rbd_dev itself.  Must run after the device side has been torn down.
 */
static void rbd_dev_image_release(struct rbd_device *rbd_dev)
{
	rbd_dev_unprobe(rbd_dev);
	/* Undo rbd_dev_header_name() and rbd_dev_image_id() */
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;

	rbd_dev_destroy(rbd_dev);
}
5301
/*
 * Probe for the existence of the header object for the given rbd
 * device.  If this image is the one being mapped (i.e., not a
 * parent), initiate a watch on its header object before using that
 * object to get detailed information about the rbd image.
 *
 * Returns 0 on success; on failure everything set up so far is
 * unwound (image id, header name, watch, header info) and a negative
 * errno is returned.
 */
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)
{
	int ret;

	/*
	 * Get the id from the image id object.  Unless there's an
	 * error, rbd_dev->spec->image_id will be filled in with
	 * a dynamically-allocated string, and rbd_dev->image_format
	 * will be set to either 1 or 2.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_header_name(rbd_dev);
	if (ret)
		goto err_out_format;

	/* Only the mapped image (not a parent) gets a header watch */
	if (mapping) {
		ret = rbd_dev_header_watch_sync(rbd_dev);
		if (ret)
			goto out_header_name;
	}

	ret = rbd_dev_header_info(rbd_dev);
	if (ret)
		goto err_out_watch;

	/*
	 * If this image is the one being mapped, we have pool name and
	 * id, image name and id, and snap name - need to fill snap id.
	 * Otherwise this is a parent image, identified by pool, image
	 * and snap ids - need to fill in names for those ids.
	 */
	if (mapping)
		ret = rbd_spec_fill_snap_id(rbd_dev);
	else
		ret = rbd_spec_fill_names(rbd_dev);
	if (ret)
		goto err_out_probe;

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret)
			goto err_out_probe;

		/*
		 * Need to warn users if this image is the one being
		 * mapped and has a parent.
		 */
		if (mapping && rbd_dev->parent_spec)
			rbd_warn(rbd_dev,
				"WARNING: kernel layering is EXPERIMENTAL!");
	}

	ret = rbd_dev_probe_parent(rbd_dev);
	if (ret)
		goto err_out_probe;

	dout("discovered format %u image, header name is %s\n",
		rbd_dev->image_format, rbd_dev->header_name);
	return 0;

	/* Error labels unwind the steps above in reverse order */
err_out_probe:
	rbd_dev_unprobe(rbd_dev);
err_out_watch:
	if (mapping)
		rbd_dev_header_unwatch_sync(rbd_dev);
out_header_name:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
err_out_format:
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;
	return ret;
}
5385
/*
 * Handle a write to the sysfs "add" attribute: parse the mapping
 * specification in @buf, connect to the cluster, probe the image and
 * set up the block device.  Returns @count on success or a negative
 * errno on failure.
 */
static ssize_t do_rbd_add(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	bool read_only;
	int rc = -ENOMEM;

	/* Pin the module until the device is removed again */
	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto err_out_module;
	read_only = rbd_opts->read_only;
	kfree(rbd_opts);
	rbd_opts = NULL;	/* done with this */

	/* rbd_get_client() consumes ceph_opts whether it succeeds or not */
	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}

	/* pick the pool */
	rc = rbd_add_get_pool_id(rbdc, spec->pool_name);
	if (rc < 0)
		goto err_out_client;
	spec->pool_id = (u64)rc;

	/* The ceph file layout needs to fit pool id in 32 bits */

	if (spec->pool_id > (u64)U32_MAX) {
		rbd_warn(NULL, "pool id too large (%llu > %u)",
			(unsigned long long)spec->pool_id, U32_MAX);
		rc = -EIO;
		goto err_out_client;
	}

	rbd_dev = rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		goto err_out_client;
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */

	rc = rbd_dev_image_probe(rbd_dev, true);
	if (rc < 0)
		goto err_out_rbd_dev;

	/* If we are mapping a snapshot it must be marked read-only */

	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
		read_only = true;
	rbd_dev->mapping.read_only = read_only;

	rc = rbd_dev_device_setup(rbd_dev);
	if (rc) {
		/*
		 * rbd_dev_header_unwatch_sync() can't be moved into
		 * rbd_dev_image_release() without refactoring, see
		 * commit 1f3ef78861ac.
		 */
		rbd_dev_header_unwatch_sync(rbd_dev);
		rbd_dev_image_release(rbd_dev);
		goto err_out_module;
	}

	return count;

	/* Unwind only what is still owned locally (see ownership notes) */
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	rbd_spec_put(spec);
err_out_module:
	module_put(THIS_MODULE);

	dout("Error adding device %s\n", buf);

	return (ssize_t)rc;
}
5473
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005474static ssize_t rbd_add(struct bus_type *bus,
5475 const char *buf,
5476 size_t count)
5477{
5478 if (single_major)
5479 return -EINVAL;
5480
5481 return do_rbd_add(bus, buf, count);
5482}
5483
5484static ssize_t rbd_add_single_major(struct bus_type *bus,
5485 const char *buf,
5486 size_t count)
5487{
5488 return do_rbd_add(bus, buf, count);
5489}
5490
Alex Elder200a6a82013-04-28 23:32:34 -05005491static void rbd_dev_device_release(struct device *dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005492{
Alex Elder593a9e72012-02-07 12:03:37 -06005493 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005494
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04005495 destroy_workqueue(rbd_dev->rq_wq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005496 rbd_free_disk(rbd_dev);
Alex Elder200a6a82013-04-28 23:32:34 -05005497 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
Alex Elder6d80b132013-05-06 07:40:30 -05005498 rbd_dev_mapping_clear(rbd_dev);
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005499 if (!single_major)
5500 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Eldere2839302012-08-29 17:11:06 -05005501 rbd_dev_id_put(rbd_dev);
Alex Elderd1cf5782013-04-27 09:59:30 -05005502 rbd_dev_mapping_clear(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005503}
5504
/*
 * Detach and release the whole chain of parent images hanging off
 * @rbd_dev, always removing the deepest ancestor first so that no
 * image is released while it still has a parent of its own.
 */
static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
{
	while (rbd_dev->parent) {
		struct rbd_device *first = rbd_dev;
		struct rbd_device *second = first->parent;
		struct rbd_device *third;

		/*
		 * Follow to the parent with no grandparent and
		 * remove it.
		 */
		while (second && (third = second->parent)) {
			first = second;
			second = third;
		}
		rbd_assert(second);
		rbd_dev_image_release(second);
		/* Detach the released image from its child */
		first->parent = NULL;
		first->parent_overlap = 0;

		rbd_assert(first->parent_spec);
		rbd_spec_put(first->parent_spec);
		first->parent_spec = NULL;
	}
}
5530
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005531static ssize_t do_rbd_remove(struct bus_type *bus,
5532 const char *buf,
5533 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005534{
5535 struct rbd_device *rbd_dev = NULL;
Alex Elder751cc0e2013-05-31 15:17:01 -05005536 struct list_head *tmp;
5537 int dev_id;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005538 unsigned long ul;
Alex Elder82a442d2013-05-31 17:40:44 -05005539 bool already = false;
Alex Elder0d8189e2013-04-27 09:59:30 -05005540 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005541
Jingoo Hanbb8e0e82013-09-11 14:20:07 -07005542 ret = kstrtoul(buf, 10, &ul);
Alex Elder0d8189e2013-04-27 09:59:30 -05005543 if (ret)
5544 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005545
5546 /* convert to int; abort if we lost anything in the conversion */
Alex Elder751cc0e2013-05-31 15:17:01 -05005547 dev_id = (int)ul;
5548 if (dev_id != ul)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005549 return -EINVAL;
5550
Alex Elder751cc0e2013-05-31 15:17:01 -05005551 ret = -ENOENT;
5552 spin_lock(&rbd_dev_list_lock);
5553 list_for_each(tmp, &rbd_dev_list) {
5554 rbd_dev = list_entry(tmp, struct rbd_device, node);
5555 if (rbd_dev->dev_id == dev_id) {
5556 ret = 0;
5557 break;
5558 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005559 }
Alex Elder751cc0e2013-05-31 15:17:01 -05005560 if (!ret) {
5561 spin_lock_irq(&rbd_dev->lock);
5562 if (rbd_dev->open_count)
5563 ret = -EBUSY;
5564 else
Alex Elder82a442d2013-05-31 17:40:44 -05005565 already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
5566 &rbd_dev->flags);
Alex Elder751cc0e2013-05-31 15:17:01 -05005567 spin_unlock_irq(&rbd_dev->lock);
5568 }
5569 spin_unlock(&rbd_dev_list_lock);
Alex Elder82a442d2013-05-31 17:40:44 -05005570 if (ret < 0 || already)
Alex Elder1ba0f1e2013-05-31 15:17:01 -05005571 return ret;
Alex Elder751cc0e2013-05-31 15:17:01 -05005572
Ilya Dryomovfca27062013-12-16 18:02:40 +02005573 rbd_dev_header_unwatch_sync(rbd_dev);
Josh Durgin9abc5992013-08-29 17:31:03 -07005574 /*
5575 * flush remaining watch callbacks - these must be complete
5576 * before the osd_client is shutdown
5577 */
5578 dout("%s: flushing notifies", __func__);
5579 ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
Ilya Dryomovfca27062013-12-16 18:02:40 +02005580
Josh Durgin98752012013-08-29 17:26:31 -07005581 /*
5582 * Don't free anything from rbd_dev->disk until after all
5583 * notifies are completely processed. Otherwise
5584 * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting
5585 * in a potential use after free of rbd_dev->disk or rbd_dev.
5586 */
5587 rbd_bus_del_dev(rbd_dev);
Alex Elder8ad42cd2013-04-28 23:32:34 -05005588 rbd_dev_image_release(rbd_dev);
Alex Elder79ab7552013-04-28 23:32:34 -05005589 module_put(THIS_MODULE);
Alex Elderaafb2302012-09-06 16:00:54 -05005590
Alex Elder1ba0f1e2013-05-31 15:17:01 -05005591 return count;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005592}
5593
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005594static ssize_t rbd_remove(struct bus_type *bus,
5595 const char *buf,
5596 size_t count)
5597{
5598 if (single_major)
5599 return -EINVAL;
5600
5601 return do_rbd_remove(bus, buf, count);
5602}
5603
5604static ssize_t rbd_remove_single_major(struct bus_type *bus,
5605 const char *buf,
5606 size_t count)
5607{
5608 return do_rbd_remove(bus, buf, count);
5609}
5610
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005611/*
5612 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005613 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005614 */
5615static int rbd_sysfs_init(void)
5616{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005617 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005618
Alex Elderfed4c142012-02-07 12:03:36 -06005619 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06005620 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005621 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005622
Alex Elderfed4c142012-02-07 12:03:36 -06005623 ret = bus_register(&rbd_bus_type);
5624 if (ret < 0)
5625 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005626
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005627 return ret;
5628}
5629
static void rbd_sysfs_cleanup(void)
{
	/* Tear down in reverse order of rbd_sysfs_init() */
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
5635
Alex Elder1c2a9df2013-05-01 12:43:03 -05005636static int rbd_slab_init(void)
5637{
5638 rbd_assert(!rbd_img_request_cache);
5639 rbd_img_request_cache = kmem_cache_create("rbd_img_request",
5640 sizeof (struct rbd_img_request),
5641 __alignof__(struct rbd_img_request),
5642 0, NULL);
Alex Elder868311b2013-05-01 12:43:03 -05005643 if (!rbd_img_request_cache)
5644 return -ENOMEM;
5645
5646 rbd_assert(!rbd_obj_request_cache);
5647 rbd_obj_request_cache = kmem_cache_create("rbd_obj_request",
5648 sizeof (struct rbd_obj_request),
5649 __alignof__(struct rbd_obj_request),
5650 0, NULL);
Alex Elder78c2a442013-05-01 12:43:04 -05005651 if (!rbd_obj_request_cache)
5652 goto out_err;
5653
5654 rbd_assert(!rbd_segment_name_cache);
5655 rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
Ilya Dryomov2d0ebc52014-01-27 17:40:18 +02005656 CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL);
Alex Elder78c2a442013-05-01 12:43:04 -05005657 if (rbd_segment_name_cache)
Alex Elder1c2a9df2013-05-01 12:43:03 -05005658 return 0;
Alex Elder78c2a442013-05-01 12:43:04 -05005659out_err:
5660 if (rbd_obj_request_cache) {
5661 kmem_cache_destroy(rbd_obj_request_cache);
5662 rbd_obj_request_cache = NULL;
5663 }
Alex Elder1c2a9df2013-05-01 12:43:03 -05005664
Alex Elder868311b2013-05-01 12:43:03 -05005665 kmem_cache_destroy(rbd_img_request_cache);
5666 rbd_img_request_cache = NULL;
5667
Alex Elder1c2a9df2013-05-01 12:43:03 -05005668 return -ENOMEM;
5669}
5670
/* Destroy the slab caches created by rbd_slab_init(). */
static void rbd_slab_exit(void)
{
	rbd_assert(rbd_segment_name_cache);
	kmem_cache_destroy(rbd_segment_name_cache);
	rbd_segment_name_cache = NULL;

	rbd_assert(rbd_obj_request_cache);
	kmem_cache_destroy(rbd_obj_request_cache);
	rbd_obj_request_cache = NULL;

	rbd_assert(rbd_img_request_cache);
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
}
5685
/*
 * Module init: verify libceph compatibility, create the slab caches,
 * reserve the shared block major (single_major mode only) and create
 * the sysfs bus.  Unwinds in reverse order on failure.
 */
static int __init rbd_init(void)
{
	int rc;

	if (!libceph_compatible(NULL)) {
		rbd_warn(NULL, "libceph incompatibility (quitting)");
		return -EINVAL;
	}

	rc = rbd_slab_init();
	if (rc)
		return rc;

	/*
	 * In single_major mode one block major is shared by all rbd
	 * devices and is reserved here; otherwise each device
	 * registers its own major in rbd_dev_device_setup().
	 */
	if (single_major) {
		rbd_major = register_blkdev(0, RBD_DRV_NAME);
		if (rbd_major < 0) {
			rc = rbd_major;
			goto err_out_slab;
		}
	}

	rc = rbd_sysfs_init();
	if (rc)
		goto err_out_blkdev;

	if (single_major)
		pr_info("loaded (major %d)\n", rbd_major);
	else
		pr_info("loaded\n");

	return 0;

err_out_blkdev:
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
err_out_slab:
	rbd_slab_exit();
	return rc;
}
5725
/*
 * Module exit: release the device-id ida, remove the sysfs bus, drop
 * the shared major (single_major mode only) and destroy the slabs.
 */
static void __exit rbd_exit(void)
{
	ida_destroy(&rbd_dev_id_ida);
	rbd_sysfs_cleanup();
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
	rbd_slab_exit();
}
5734
5735module_init(rbd_init);
5736module_exit(rbd_exit);
5737
Alex Elderd552c612013-05-31 20:13:09 -05005738MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005739MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
5740MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005741/* following authorship retained from original osdblk.c */
5742MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
5743
Ilya Dryomov90da2582013-12-13 15:28:56 +02005744MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005745MODULE_LICENSE("GPL");