
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		 Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/workqueue.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/*
 * Increment the given counter and return its updated value.
 * If the counter is already 0 it will not be incremented.
 * If the counter is already at its maximum value, -EINVAL is
 * returned without updating it.
 */
static int atomic_inc_return_safe(atomic_t *v)
{
	unsigned int counter;

	counter = (unsigned int)__atomic_add_unless(v, 1, 0);
	if (counter <= (unsigned int)INT_MAX)
		return (int)counter;

	atomic_dec(v);

	return -EINVAL;
}

/* Decrement the counter.  Return the resulting value, or -EINVAL */
static int atomic_dec_return_safe(atomic_t *v)
{
	int counter;

	counter = atomic_dec_return(v);
	if (counter >= 0)
		return counter;

	atomic_inc(v);

	return -EINVAL;
}
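
/*
 * Illustrative sketch (not code from this file): these helpers suit a
 * reference count that must never be resurrected once it drops to 0,
 * e.g. the parent_ref counter in struct rbd_device below.  A
 * hypothetical get path:
 *
 *	if (atomic_inc_return_safe(&rbd_dev->parent_ref) > 0)
 *		...use the parent, then atomic_dec_return_safe() to put...
 *	else
 *		...counter was 0 (or pinned at INT_MAX), do not use it...
 */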

#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX	/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING		(1<<0)
#define RBD_FEATURE_STRIPINGV2		(1<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1<<2)
#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These six fields never change for a given rbd image */
	char *object_prefix;
	__u8 obj_order;
	__u8 crypt_type;
	__u8 comp_type;
	u64 stripe_unit;
	u64 stripe_count;
	u64 features;		/* Might be changeable someday? */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* format 1 only */
	u64 *snap_sizes;	/* format 1 only */
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	const char	*pool_name;

	const char	*image_id;
	const char	*image_name;

	u64		snap_id;
	const char	*snap_name;

	struct kref	kref;
};

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};

struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

enum obj_request_type {
	OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};

enum obj_operation_type {
	OBJ_OP_WRITE,
	OBJ_OP_READ,
	OBJ_OP_DISCARD,
};

enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion flag: not done = 0, done = 1 */
	OBJ_REQ_IMG_DATA,	/* object usage: standalone = 0, image = 1 */
	OBJ_REQ_KNOWN,		/* EXISTS flag valid: no = 0, yes = 1 */
	OBJ_REQ_EXISTS,		/* target exists: no = 0, yes = 1 */
};

struct rbd_obj_request {
	const char		*object_name;
	u64			offset;		/* object start byte */
	u64			length;		/* bytes from offset */
	unsigned long		flags;

	/*
	 * An object request associated with an image will have its
	 * img_data flag set; a standalone object request will not.
	 *
	 * A standalone object request will have which == BAD_WHICH
	 * and a null obj_request pointer.
	 *
	 * An object request initiated in support of a layered image
	 * object (to check for its existence before a write) will
	 * have which == BAD_WHICH and a non-null obj_request pointer.
	 *
	 * Finally, an object request for rbd image data will have
	 * which != BAD_WHICH, and will have a non-null img_request
	 * pointer.  The value of which will be in the range
	 * 0..(img_request->obj_request_count-1).
	 */
	union {
		struct rbd_obj_request	*obj_request;	/* STAT op */
		struct {
			struct rbd_img_request	*img_request;
			u64			img_offset;
			/* links for img_request->obj_requests list */
			struct list_head	links;
		};
	};
	u32			which;		/* posn image request list */

	enum obj_request_type	type;
	union {
		struct bio	*bio_list;
		struct {
			struct page	**pages;
			u32		page_count;
		};
	};
	struct page		**copyup_pages;
	u32			copyup_page_count;

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	int			result;

	rbd_obj_callback_t	callback;
	struct completion	completion;

	struct kref		kref;
};

enum img_req_flags {
	IMG_REQ_WRITE,		/* I/O direction: read = 0, write = 1 */
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
	IMG_REQ_DISCARD,	/* discard: normal = 0, discard request = 1 */
};

struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	u64			offset;	/* starting image byte offset */
	u64			length;	/* byte count from offset */
	unsigned long		flags;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};
	struct page		**copyup_pages;
	u32			copyup_page_count;
	spinlock_t		completion_lock;/* protects next_completion */
	u32			next_completion;
	rbd_img_callback_t	callback;
	u64			xferred;/* aggregate bytes transferred */
	int			result;	/* first nonzero obj_request result */

	u32			obj_request_count;
	struct list_head	obj_requests;	/* rbd_obj_request structs */

	struct kref		kref;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
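
/*
 * Illustrative use of the iterators above (a hypothetical walk, not a
 * function from this file):
 *
 *	struct rbd_obj_request *obj_request;
 *
 *	for_each_obj_request(img_request, obj_request)
 *		...visit each object request of img_request in order...
 */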

enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};

enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};

/* WatchNotify::ClientId */
struct rbd_client_id {
	u64 gid;
	u64 handle;
};

struct rbd_mapping {
	u64			size;
	u64			features;
	bool			read_only;
};

/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	int			minor;
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
	char			*config_info;	/* add{,_single_major} string */

	struct ceph_object_id	header_oid;
	struct ceph_object_locator header_oloc;

	struct ceph_file_layout	layout;		/* used for all rbd requests */

	struct mutex		watch_mutex;
	enum rbd_watch_state	watch_state;
	struct ceph_osd_linger_request *watch_handle;
	u64			watch_cookie;
	struct delayed_work	watch_dwork;

	struct rw_semaphore	lock_rwsem;
	enum rbd_lock_state	lock_state;
	struct rbd_client_id	owner_cid;
	struct work_struct	acquired_lock_work;
	struct work_struct	released_lock_work;
	struct delayed_work	lock_dwork;
	struct work_struct	unlock_work;
	wait_queue_head_t	lock_waitq;

	struct workqueue_struct	*task_wq;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	atomic_t		parent_ref;
	struct rbd_device	*parent;

	/* Block layer tags. */
	struct blk_mq_tag_set	tag_set;

	/* protects updating the header */
	struct rw_semaphore	header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags.  If atomicity is required,
 * rbd_dev->lock is used to protect access.
 *
 * Currently, only the "removing" flag (which is coupled with the
 * "open_count" field) requires atomic access.
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
};

static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */

static LIST_HEAD(rbd_dev_list);		/* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);	/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;
static struct kmem_cache	*rbd_segment_name_cache;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

static struct workqueue_struct *rbd_wq;

/*
 * Default to false for now, as single-major requires >= 0.75 version of
 * userspace rbd utility.
 */
static bool single_major = false;
module_param(single_major, bool, S_IRUGO);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)");
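
/*
 * Illustrative usage (assumed typical invocation, not from this file):
 *
 *	# modprobe rbd single_major=Y
 *
 * With it enabled, all rbd devices share one major number and minors
 * are carved up per device via RBD_SINGLE_MAJOR_PART_SHIFT below.
 */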

static int rbd_img_request_submit(struct rbd_img_request *img_request);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
				    size_t count);
static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
				       size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
static void rbd_spec_put(struct rbd_spec *spec);

static int rbd_dev_id_to_minor(int dev_id)
{
	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
}

static int minor_to_rbd_dev_id(int minor)
{
	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}
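
/*
 * Worked example (illustrative): with RBD_SINGLE_MAJOR_PART_SHIFT == 4,
 * each device owns 16 consecutive minors.  Device id 3 maps to minor
 * 3 << 4 == 48, and any of its minors 48..63 maps back to dev_id
 * 48 >> 4 == 3.
 */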

static bool rbd_is_lock_supported(struct rbd_device *rbd_dev)
{
	return (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
	       rbd_dev->spec->snap_id == CEPH_NOSNAP &&
	       !rbd_dev->mapping.read_only;
}

static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
}

static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	bool is_lock_owner;

	down_read(&rbd_dev->lock_rwsem);
	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
	return is_lock_owner;
}

static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);

static struct attribute *rbd_bus_attrs[] = {
	&bus_attr_add.attr,
	&bus_attr_remove.attr,
	&bus_attr_add_single_major.attr,
	&bus_attr_remove_single_major.attr,
	NULL,
};

static umode_t rbd_bus_is_visible(struct kobject *kobj,
				  struct attribute *attr, int index)
{
	if (!single_major &&
	    (attr == &bus_attr_add_single_major.attr ||
	     attr == &bus_attr_remove_single_major.attr))
		return 0;

	return attr->mode;
}

static const struct attribute_group rbd_bus_group = {
	.attrs = rbd_bus_attrs,
	.is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_groups	= rbd_bus_groups,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};

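/*
 * Illustrative sysfs usage per Documentation/ABI/testing/sysfs-bus-rbd
 * (monitor address, credentials and names below are placeholders):
 *
 *	# echo "1.2.3.4:6789 name=admin rbdpool myimage" > /sys/bus/rbd/add
 *	# echo 0 > /sys/bus/rbd/remove
 */
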
static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}

#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static void rbd_osd_copyup_callback(struct rbd_obj_request *obj_request);
static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_header_info(struct rbd_device *rbd_dev);
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size);
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features);

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	(void) get_device(&rbd_dev->dev);

	return 0;
}

static void rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	put_device(&rbd_dev->dev);
}

static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
{
	int ret = 0;
	int val;
	bool ro;
	bool ro_changed = false;

	/* get_user() may sleep, so call it before taking rbd_dev->lock */
	if (get_user(val, (int __user *)(arg)))
		return -EFAULT;

	ro = val ? true : false;
	/* A mapped snapshot can't be made writable */
	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
		return -EROFS;

	spin_lock_irq(&rbd_dev->lock);
	/* refuse if anyone else has the device open */
	if (rbd_dev->open_count > 1) {
		ret = -EBUSY;
		goto out;
	}

	if (rbd_dev->mapping.read_only != ro) {
		rbd_dev->mapping.read_only = ro;
		ro_changed = true;
	}

out:
	spin_unlock_irq(&rbd_dev->lock);
	/* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */
	if (ret == 0 && ro_changed)
		set_disk_ro(rbd_dev->disk, ro ? 1 : 0);

	return ret;
}

static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	int ret = 0;

	switch (cmd) {
	case BLKROSET:
		ret = rbd_ioctl_set_ro(rbd_dev, arg);
		break;
	default:
		ret = -ENOTTY;
	}

	return ret;
}

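/*
 * Illustrative: BLKROSET is the ioctl issued by the blockdev(8) utility,
 * so the handler above is what runs for e.g. (device name is a
 * placeholder):
 *
 *	# blockdev --setro /dev/rbd0
 */
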
#ifdef CONFIG_COMPAT
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	return rbd_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
	.ioctl			= rbd_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl		= rbd_compat_ioctl,
#endif
};

/*
 * Initialize an rbd client instance.  Success or not, this function
 * consumes ceph_opts.  Caller holds client_mutex.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
	if (IS_ERR(rbdc->client))
		goto out_rbdc;
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_client;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;
out_client:
	ceph_destroy_client(rbdc->client);
out_rbdc:
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}
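
/*
 * Aside (assumption about libceph, not from this file): CEPH_OPT_NOSHARE
 * is set by the "noshare" libceph option, so a mapping that passes it in
 * its options string always gets a fresh ceph_client here instead of
 * sharing an existing one.
 */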

/*
 * (Per device) rbd map options
 */
enum {
	Opt_queue_depth,
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_err
};

static match_table_t rbd_opts_tokens = {
	{Opt_queue_depth, "queue_depth=%d"},
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	{Opt_lock_on_read, "lock_on_read"},
	{Opt_err, NULL}
};

struct rbd_options {
	int	queue_depth;
	bool	read_only;
	bool	lock_on_read;
};

#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
#define RBD_READ_ONLY_DEFAULT	false
#define RBD_LOCK_ON_READ_DEFAULT false

static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token, argstr[0].from);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_queue_depth:
		if (intval < 1) {
			pr_err("queue_depth out of range\n");
			return -EINVAL;
		}
		rbd_opts->queue_depth = intval;
		break;
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	case Opt_lock_on_read:
		rbd_opts->lock_on_read = true;
		break;
	default:
		/* libceph prints "bad option" msg */
		return -EINVAL;
	}

	return 0;
}
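
/*
 * Illustrative option string handled by this parser, one token at a
 * time (pool/image names are placeholders):
 *
 *	# rbd map -o queue_depth=128,lock_on_read,ro rbdpool/myimage
 *
 * Integer tokens (those before Opt_last_int) have their argument parsed
 * with match_int(); bare flags such as "ro" fall through to the switch.
 */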

static char *obj_op_name(enum obj_operation_type op_type)
{
	switch (op_type) {
	case OBJ_OP_READ:
		return "read";
	case OBJ_OP_WRITE:
		return "write";
	case OBJ_OP_DISCARD:
		return "discard";
	default:
		return "???";
	}
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, ceph_opts is consumed by this
 * function.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;

	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
	rbdc = rbd_client_find(ceph_opts);
	if (rbdc)	/* using an existing client */
		ceph_destroy_options(ceph_opts);
	else
		rbdc = rbd_client_create(ceph_opts);
	mutex_unlock(&client_mutex);

	return rbdc;
}

/*
 * Destroy ceph client
 *
 * Takes rbd_client_list_lock to unlink the client; the caller must
 * not hold it.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}

/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
				 struct rbd_image_header_ondisk *ondisk)
{
	struct rbd_image_header *header = &rbd_dev->header;
	bool first_time = header->object_prefix == NULL;
	struct ceph_snap_context *snapc;
	char *object_prefix = NULL;
	char *snap_names = NULL;
	u64 *snap_sizes = NULL;
	u32 snap_count;
	size_t size;
	int ret = -ENOMEM;
	u32 i;

	/* Allocate this now to avoid having to handle failure below */

	if (first_time) {
		size_t len;

		len = strnlen(ondisk->object_prefix,
				sizeof (ondisk->object_prefix));
		object_prefix = kmalloc(len + 1, GFP_KERNEL);
		if (!object_prefix)
			return -ENOMEM;
		memcpy(object_prefix, ondisk->object_prefix, len);
		object_prefix[len] = '\0';
	}

	/* Allocate the snapshot context and fill it in */

	snap_count = le32_to_cpu(ondisk->snap_count);
	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc)
		goto out_err;
	snapc->seq = le64_to_cpu(ondisk->snap_seq);
	if (snap_count) {
		struct rbd_image_snap_ondisk *snaps;
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* We'll keep a copy of the snapshot names... */

		if (snap_names_len > (u64)SIZE_MAX)
			goto out_2big;
		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!snap_names)
			goto out_err;

		/* ...as well as the array of their sizes. */

		size = snap_count * sizeof (*header->snap_sizes);
		snap_sizes = kmalloc(size, GFP_KERNEL);
		if (!snap_sizes)
			goto out_err;

		/*
		 * Copy the names, and fill in each snapshot's id
		 * and size.
		 *
		 * Note that rbd_dev_v1_header_info() guarantees the
		 * ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, so this memcpy() is safe.
		 */
		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
		snaps = ondisk->snaps;
		for (i = 0; i < snap_count; i++) {
			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
		}
	}

	/* We won't fail any more, fill in the header */

	if (first_time) {
		header->object_prefix = object_prefix;
		header->obj_order = ondisk->options.order;
		header->crypt_type = ondisk->options.crypt_type;
		header->comp_type = ondisk->options.comp_type;
		/* The rest aren't used for format 1 images */
		header->stripe_unit = 0;
		header->stripe_count = 0;
		header->features = 0;
	} else {
		ceph_put_snap_context(header->snapc);
		kfree(header->snap_names);
		kfree(header->snap_sizes);
	}

	/* The remaining fields always get updated (when we refresh) */

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->snapc = snapc;
	header->snap_names = snap_names;
	header->snap_sizes = snap_sizes;

	return 0;
out_2big:
	ret = -EIO;
out_err:
	kfree(snap_sizes);
	kfree(snap_names);
	ceph_put_snap_context(snapc);
	kfree(object_prefix);

	return ret;
}
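
/*
 * Layout sketch of the format 1 on-disk header parsed above, derived
 * from the accesses in rbd_header_from_disk() (see rbd_types.h for the
 * authoritative definition):
 *
 *	struct rbd_image_header_ondisk
 *	snaps[0..snap_count-1]	ids and sizes, struct rbd_image_snap_ondisk
 *	snapshot names		snap_names_len bytes of NUL-terminated
 *				names, starting at &ondisk->snaps[snap_count]
 */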

static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	const char *snap_name;

	rbd_assert(which < rbd_dev->header.snapc->num_snaps);

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which--)
		snap_name += strlen(snap_name) + 1;

	return kstrdup(snap_name, GFP_KERNEL);
}

/*
 * Snapshot id comparison function for use with qsort()/bsearch().
 * Note that result is for snapshots in *descending* order.
 */
static int snapid_compare_reverse(const void *s1, const void *s2)
{
	u64 snap_id1 = *(u64 *)s1;
	u64 snap_id2 = *(u64 *)s2;

	if (snap_id1 < snap_id2)
		return 1;
	return snap_id1 == snap_id2 ? 0 : -1;
}

/*
 * Search a snapshot context to see if the given snapshot id is
 * present.
 *
 * Returns the position of the snapshot id in the array if it's found,
 * or BAD_SNAP_INDEX otherwise.
 *
 * Note: The snapshot array is kept sorted (by the osd) in
 * reverse order, highest snapshot id first.
 */
static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	u64 *found;

	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
				sizeof (snap_id), snapid_compare_reverse);

	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
}
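
/*
 * Worked example (illustrative): for snapc->snaps == { 40, 12, 5 },
 * snapid_compare_reverse() makes bsearch() treat the descending array
 * as sorted, so looking up id 12 yields index 1 while looking up id 7
 * yields BAD_SNAP_INDEX.
 */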

static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id)
{
	u32 which;
	const char *snap_name;

	which = rbd_dev_snap_index(rbd_dev, snap_id);
	if (which == BAD_SNAP_INDEX)
		return ERR_PTR(-ENOENT);

	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
}

static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
	if (snap_id == CEPH_NOSNAP)
		return RBD_SNAP_HEAD_NAME;

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (rbd_dev->image_format == 1)
		return rbd_dev_v1_snap_name(rbd_dev, snap_id);

	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
}

static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_size)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_size = rbd_dev->header.image_size;
	} else if (rbd_dev->image_format == 1) {
		u32 which;

		which = rbd_dev_snap_index(rbd_dev, snap_id);
		if (which == BAD_SNAP_INDEX)
			return -ENOENT;

		*snap_size = rbd_dev->header.snap_sizes[which];
	} else {
		u64 size = 0;
		int ret;

		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
		if (ret)
			return ret;

		*snap_size = size;
	}
	return 0;
}

static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
			u64 *snap_features)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_features = rbd_dev->header.features;
	} else if (rbd_dev->image_format == 1) {
		*snap_features = 0;	/* No features for format 1 */
	} else {
		u64 features = 0;
		int ret;

		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
		if (ret)
			return ret;

		*snap_features = features;
	}
	return 0;
}

static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
{
	u64 snap_id = rbd_dev->spec->snap_id;
	u64 size = 0;
	u64 features = 0;
	int ret;

	ret = rbd_snap_size(rbd_dev, snap_id, &size);
	if (ret)
		return ret;
	ret = rbd_snap_features(rbd_dev, snap_id, &features);
	if (ret)
		return ret;

	rbd_dev->mapping.size = size;
	rbd_dev->mapping.features = features;

	return 0;
}

static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
{
	rbd_dev->mapping.size = 0;
	rbd_dev->mapping.features = 0;
}

Himangi Saraogi7d5079a2014-07-24 03:17:07 +05301236static void rbd_segment_name_free(const char *name)
1237{
1238 /* The explicit cast here is needed to drop the const qualifier */
1239
1240 kmem_cache_free(rbd_segment_name_cache, (void *)name);
1241}
1242
Alex Elder98571b52013-01-20 14:44:42 -06001243static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001244{
Alex Elder65ccfe22012-08-09 10:33:26 -07001245 char *name;
1246 u64 segment;
1247 int ret;
Josh Durgin3a96d5c2013-06-12 19:15:06 -07001248 char *name_format;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001249
Alex Elder78c2a442013-05-01 12:43:04 -05001250 name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
Alex Elder65ccfe22012-08-09 10:33:26 -07001251 if (!name)
1252 return NULL;
1253 segment = offset >> rbd_dev->header.obj_order;
Josh Durgin3a96d5c2013-06-12 19:15:06 -07001254 name_format = "%s.%012llx";
1255 if (rbd_dev->image_format == 2)
1256 name_format = "%s.%016llx";
Ilya Dryomov2d0ebc52014-01-27 17:40:18 +02001257 ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format,
Alex Elder65ccfe22012-08-09 10:33:26 -07001258 rbd_dev->header.object_prefix, segment);
Ilya Dryomov2d0ebc52014-01-27 17:40:18 +02001259 if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
Alex Elder65ccfe22012-08-09 10:33:26 -07001260 pr_err("error formatting segment name for #%llu (%d)\n",
1261 segment, ret);
Himangi Saraogi7d5079a2014-07-24 03:17:07 +05301262 rbd_segment_name_free(name);
Alex Elder65ccfe22012-08-09 10:33:26 -07001263 name = NULL;
1264 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001265
Alex Elder65ccfe22012-08-09 10:33:26 -07001266 return name;
1267}

static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	return offset & (segment_size - 1);
}

static u64 rbd_segment_length(struct rbd_device *rbd_dev,
				u64 offset, u64 length)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	offset &= segment_size - 1;

	rbd_assert(length <= U64_MAX - offset);
	if (offset + length > segment_size)
		length = segment_size - offset;

	return length;
}
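
/*
 * Continuing the example above (4 MiB objects): a request for 8 KiB
 * at image offset 0x3ff000 starts 0x1000 bytes short of a segment
 * boundary, so rbd_segment_length() clamps it to 0x1000 bytes and
 * the remaining 4 KiB must become a request against the next object.
 */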

/*
 * returns the size of an object in the image
 */
static u64 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1 << header->obj_order;
}

/*
 * bio helpers
 */

static void bio_chain_put(struct bio *chain)
{
	struct bio *tmp;

	while (chain) {
		tmp = chain;
		chain = chain->bi_next;
		bio_put(tmp);
	}
}

/*
 * zeros a bio chain, starting at specific offset
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec bv;
	struct bvec_iter iter;
	unsigned long flags;
	void *buf;
	int pos = 0;

	while (chain) {
		bio_for_each_segment(bv, chain, iter) {
			if (pos + bv.bv_len > start_ofs) {
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(&bv, &flags);
				memset(buf + remainder, 0,
				       bv.bv_len - remainder);
				flush_dcache_page(bv.bv_page);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv.bv_len;
		}

		chain = chain->bi_next;
	}
}

/*
 * similar to zero_bio_chain(), zeros data defined by a page array,
 * starting at the given byte offset from the start of the array and
 * continuing up to the given end offset.  The pages array is
 * assumed to be big enough to hold all bytes up to the end.
 */
static void zero_pages(struct page **pages, u64 offset, u64 end)
{
	struct page **page = &pages[offset >> PAGE_SHIFT];

	rbd_assert(end > offset);
	rbd_assert(end - offset <= (u64)SIZE_MAX);
	while (offset < end) {
		size_t page_offset;
		size_t length;
		unsigned long flags;
		void *kaddr;

		page_offset = offset & ~PAGE_MASK;
		length = min_t(size_t, PAGE_SIZE - page_offset, end - offset);
		local_irq_save(flags);
		kaddr = kmap_atomic(*page);
		memset(kaddr + page_offset, 0, length);
		flush_dcache_page(*page);
		kunmap_atomic(kaddr);
		local_irq_restore(flags);

		offset += length;
		page++;
	}
}

/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bio;

	bio = bio_clone(bio_src, gfpmask);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio_advance(bio, offset);
	bio->bi_iter.bi_size = len;

	return bio;
}

/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains the first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					 unsigned int *offset,
					 unsigned int len,
					 gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_iter.bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi) {
			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
			goto out_err;	/* EINVAL; ran out of bio's */
		}
		bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_iter.bi_size) {
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	bio_chain_put(chain);

	return NULL;
}
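
/*
 * Usage sketch for the above (illustrative only; the variable names
 * are made up):
 *
 *	struct bio *src = ...;		head of the source chain
 *	unsigned int off = 0;		byte offset into *src
 *	struct bio *clone;
 *
 *	clone = bio_chain_clone_range(&src, &off, 4096, GFP_NOIO);
 *
 * On success, clone is a new 4096-byte chain, and src/off have been
 * advanced past the cloned bytes, ready for the next call; this is
 * how rbd_img_request_fill() below carves one bio chain into
 * per-object pieces.
 */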

/*
 * The default/initial value for all object request flags is 0.  For
 * each flag, once its value is set to 1 it is never reset to 0
 * again.
 */
static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
{
	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
		struct rbd_device *rbd_dev;

		rbd_dev = obj_request->img_request->rbd_dev;
		rbd_warn(rbd_dev, "obj_request %p already marked img_data",
			obj_request);
	}
}

static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
}

static void obj_request_done_set(struct rbd_obj_request *obj_request)
{
	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
		struct rbd_device *rbd_dev = NULL;

		if (obj_request_img_data_test(obj_request))
			rbd_dev = obj_request->img_request->rbd_dev;
		rbd_warn(rbd_dev, "obj_request %p already marked done",
			obj_request);
	}
}

static bool obj_request_done_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
}

/*
 * This sets the KNOWN flag after (possibly) setting the EXISTS
 * flag.  The latter is set based on the "exists" value provided.
 *
 * Note that for our purposes once an object exists it never goes
 * away again.  It's possible that the responses from two existence
 * checks are separated by the creation of the target object, and
 * the first ("doesn't exist") response arrives *after* the second
 * ("does exist").  In that case we ignore the second one.
 */
static void obj_request_existence_set(struct rbd_obj_request *obj_request,
				bool exists)
{
	if (exists)
		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
	smp_mb();
}

static bool obj_request_known_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
}

static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
}
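
/*
 * A note on the pattern above: set_bit() and test_bit() imply no
 * memory barriers by themselves, so the setters issue smp_mb() after
 * setting a flag and the testers issue smp_mb() before reading it,
 * presumably so that a flag set on one CPU is seen by a subsequent
 * test on another.
 */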

static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
{
	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;

	return obj_request->img_offset <
	    round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header));
}
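
/*
 * The overlap is rounded up to a whole object here: with 4 MiB
 * objects and a parent overlap of 6 MiB (made-up numbers), an object
 * request starting anywhere below 8 MiB counts as overlapping the
 * parent, since the object straddling the 6 MiB boundary still
 * partially overlaps it.
 */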

static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
	     atomic_read(&obj_request->kref.refcount));
	kref_get(&obj_request->kref);
}

static void rbd_obj_request_destroy(struct kref *kref);
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
	     atomic_read(&obj_request->kref.refcount));
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}

static void rbd_img_request_get(struct rbd_img_request *img_request)
{
	dout("%s: img %p (was %d)\n", __func__, img_request,
	     atomic_read(&img_request->kref.refcount));
	kref_get(&img_request->kref);
}

static bool img_request_child_test(struct rbd_img_request *img_request);
static void rbd_parent_request_destroy(struct kref *kref);
static void rbd_img_request_destroy(struct kref *kref);
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	dout("%s: img %p (was %d)\n", __func__, img_request,
	     atomic_read(&img_request->kref.refcount));
	if (img_request_child_test(img_request))
		kref_put(&img_request->kref, rbd_parent_request_destroy);
	else
		kref_put(&img_request->kref, rbd_img_request_destroy);
}

static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->img_request == NULL);

	/* Image request now owns object's original reference */
	obj_request->img_request = img_request;
	obj_request->which = img_request->obj_request_count;
	rbd_assert(!obj_request_img_data_test(obj_request));
	obj_request_img_data_set(obj_request);
	rbd_assert(obj_request->which != BAD_WHICH);
	img_request->obj_request_count++;
	list_add_tail(&obj_request->links, &img_request->obj_requests);
	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
}

static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->which != BAD_WHICH);

	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
	list_del(&obj_request->links);
	rbd_assert(img_request->obj_request_count > 0);
	img_request->obj_request_count--;
	rbd_assert(obj_request->which == img_request->obj_request_count);
	obj_request->which = BAD_WHICH;
	rbd_assert(obj_request_img_data_test(obj_request));
	rbd_assert(obj_request->img_request == img_request);
	obj_request->img_request = NULL;
	obj_request->callback = NULL;
	rbd_obj_request_put(obj_request);
}

static bool obj_request_type_valid(enum obj_request_type type)
{
	switch (type) {
	case OBJ_REQUEST_NODATA:
	case OBJ_REQUEST_BIO:
	case OBJ_REQUEST_PAGES:
		return true;
	default:
		return false;
	}
}

static void rbd_img_obj_callback(struct rbd_obj_request *obj_request);

static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
{
	struct ceph_osd_request *osd_req = obj_request->osd_req;

	dout("%s %p osd_req %p\n", __func__, obj_request, osd_req);
	if (obj_request_img_data_test(obj_request)) {
		WARN_ON(obj_request->callback != rbd_img_obj_callback);
		rbd_img_request_get(obj_request->img_request);
	}
	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
}

static void rbd_obj_request_end(struct rbd_obj_request *obj_request)
{
	dout("%s %p\n", __func__, obj_request);
	ceph_osdc_cancel_request(obj_request->osd_req);
}

/*
 * Wait for an object request to complete.  If interrupted, cancel the
 * underlying osd request.
 *
 * @timeout: in jiffies, 0 means "wait forever"
 */
static int __rbd_obj_request_wait(struct rbd_obj_request *obj_request,
				  unsigned long timeout)
{
	long ret;

	dout("%s %p\n", __func__, obj_request);
	ret = wait_for_completion_interruptible_timeout(
					&obj_request->completion,
					ceph_timeout_jiffies(timeout));
	if (ret <= 0) {
		if (ret == 0)
			ret = -ETIMEDOUT;
		rbd_obj_request_end(obj_request);
	} else {
		ret = 0;
	}

	dout("%s %p ret %d\n", __func__, obj_request, (int)ret);
	return ret;
}

static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
	return __rbd_obj_request_wait(obj_request, 0);
}
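
/*
 * So a caller of either variant sees 0 on normal completion,
 * -ETIMEDOUT if the timeout elapsed, or -ERESTARTSYS if interrupted
 * by a signal (the value wait_for_completion_interruptible_timeout()
 * returns); in the latter two cases the osd request has already been
 * canceled by the time the wait returns.
 */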

static void rbd_img_request_complete(struct rbd_img_request *img_request)
{
	dout("%s: img %p\n", __func__, img_request);

	/*
	 * If no error occurred, compute the aggregate transfer
	 * count for the image request.  We could instead use
	 * atomic64_cmpxchg() to update it as each object request
	 * completes; not clear which way is better off hand.
	 */
	if (!img_request->result) {
		struct rbd_obj_request *obj_request;
		u64 xferred = 0;

		for_each_obj_request(img_request, obj_request)
			xferred += obj_request->xferred;
		img_request->xferred = xferred;
	}

	if (img_request->callback)
		img_request->callback(img_request);
	else
		rbd_img_request_put(img_request);
}

/*
 * The default/initial value for all image request flags is 0.  Each
 * is conditionally set to 1 at image request initialization time
 * and currently never changes thereafter.
 */
static void img_request_write_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_WRITE, &img_request->flags);
	smp_mb();
}

static bool img_request_write_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
}

/*
 * Set the discard flag when the img_request is a discard request
 */
static void img_request_discard_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_DISCARD, &img_request->flags);
	smp_mb();
}

static bool img_request_discard_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0;
}

static void img_request_child_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_CHILD, &img_request->flags);
	smp_mb();
}

static void img_request_child_clear(struct rbd_img_request *img_request)
{
	clear_bit(IMG_REQ_CHILD, &img_request->flags);
	smp_mb();
}

static bool img_request_child_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
}

static void img_request_layered_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_LAYERED, &img_request->flags);
	smp_mb();
}

static void img_request_layered_clear(struct rbd_img_request *img_request)
{
	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
	smp_mb();
}

static bool img_request_layered_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
}

static enum obj_operation_type
rbd_img_request_op_type(struct rbd_img_request *img_request)
{
	if (img_request_write_test(img_request))
		return OBJ_OP_WRITE;
	else if (img_request_discard_test(img_request))
		return OBJ_OP_DISCARD;
	else
		return OBJ_OP_READ;
}

static void
rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
{
	u64 xferred = obj_request->xferred;
	u64 length = obj_request->length;

	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
		obj_request, obj_request->img_request, obj_request->result,
		xferred, length);
	/*
	 * ENOENT means a hole in the image.  We zero-fill the entire
	 * length of the request.  A short read also implies zero-fill
	 * to the end of the request.  An error requires the whole
	 * length of the request to be reported finished with an error
	 * to the block layer.  In each case we update the xferred
	 * count to indicate the whole request was satisfied.
	 */
	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
	if (obj_request->result == -ENOENT) {
		if (obj_request->type == OBJ_REQUEST_BIO)
			zero_bio_chain(obj_request->bio_list, 0);
		else
			zero_pages(obj_request->pages, 0, length);
		obj_request->result = 0;
	} else if (xferred < length && !obj_request->result) {
		if (obj_request->type == OBJ_REQUEST_BIO)
			zero_bio_chain(obj_request->bio_list, xferred);
		else
			zero_pages(obj_request->pages, xferred, length);
	}
	obj_request->xferred = length;
	obj_request_done_set(obj_request);
}
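
/*
 * For example, an 8192-byte read that transfers only 4096 bytes with
 * no error has bytes 4096..8191 zero-filled and is reported upward
 * as a full 8192-byte transfer, while a read that fails with -ENOENT
 * (a hole in the image) is zero-filled in its entirety and its
 * result reset to 0.
 */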

static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p cb %p\n", __func__, obj_request,
		obj_request->callback);
	if (obj_request->callback)
		obj_request->callback(obj_request);
	else
		complete_all(&obj_request->completion);
}

static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request = NULL;
	struct rbd_device *rbd_dev = NULL;
	bool layered = false;

	if (obj_request_img_data_test(obj_request)) {
		img_request = obj_request->img_request;
		layered = img_request && img_request_layered_test(img_request);
		rbd_dev = img_request->rbd_dev;
	}

	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
		obj_request, img_request, obj_request->result,
		obj_request->xferred, obj_request->length);
	if (layered && obj_request->result == -ENOENT &&
			obj_request->img_offset < rbd_dev->parent_overlap)
		rbd_img_parent_read(obj_request);
	else if (img_request)
		rbd_img_obj_request_read_callback(obj_request);
	else
		obj_request_done_set(obj_request);
}

static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
		obj_request->result, obj_request->length);
	/*
	 * There is no such thing as a successful short write.  Set
	 * it to our originally-requested length.
	 */
	obj_request->xferred = obj_request->length;
	obj_request_done_set(obj_request);
}

static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
		obj_request->result, obj_request->length);
	/*
	 * There is no such thing as a successful short discard.  Set
	 * it to our originally-requested length.
	 */
	obj_request->xferred = obj_request->length;
	/* discarding a non-existent object is not a problem */
	if (obj_request->result == -ENOENT)
		obj_request->result = 0;
	obj_request_done_set(obj_request);
}

/*
 * For a simple stat call there's nothing to do.  We'll do more if
 * this is part of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);
	obj_request_done_set(obj_request);
}

static void rbd_osd_call_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	if (obj_request_img_data_test(obj_request))
		rbd_osd_copyup_callback(obj_request);
	else
		obj_request_done_set(obj_request);
}

static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;
	u16 opcode;

	dout("%s: osd_req %p\n", __func__, osd_req);
	rbd_assert(osd_req == obj_request->osd_req);
	if (obj_request_img_data_test(obj_request)) {
		rbd_assert(obj_request->img_request);
		rbd_assert(obj_request->which != BAD_WHICH);
	} else {
		rbd_assert(obj_request->which == BAD_WHICH);
	}

	if (osd_req->r_result < 0)
		obj_request->result = osd_req->r_result;

	/*
	 * We support a 64-bit length, but ultimately it has to be
	 * passed to the block layer, which just supports a 32-bit
	 * length field.
	 */
	obj_request->xferred = osd_req->r_ops[0].outdata_len;
	rbd_assert(obj_request->xferred < (u64)UINT_MAX);

	opcode = osd_req->r_ops[0].op;
	switch (opcode) {
	case CEPH_OSD_OP_READ:
		rbd_osd_read_callback(obj_request);
		break;
	case CEPH_OSD_OP_SETALLOCHINT:
		rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE ||
			   osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL);
		/* fall through */
	case CEPH_OSD_OP_WRITE:
	case CEPH_OSD_OP_WRITEFULL:
		rbd_osd_write_callback(obj_request);
		break;
	case CEPH_OSD_OP_STAT:
		rbd_osd_stat_callback(obj_request);
		break;
	case CEPH_OSD_OP_DELETE:
	case CEPH_OSD_OP_TRUNCATE:
	case CEPH_OSD_OP_ZERO:
		rbd_osd_discard_callback(obj_request);
		break;
	case CEPH_OSD_OP_CALL:
		rbd_osd_call_callback(obj_request);
		break;
	default:
		rbd_warn(NULL, "%s: unsupported op %hu",
			 obj_request->object_name, (unsigned short) opcode);
		break;
	}

	if (obj_request_done_test(obj_request))
		rbd_obj_request_complete(obj_request);
}

static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
{
	struct ceph_osd_request *osd_req = obj_request->osd_req;

	rbd_assert(obj_request_img_data_test(obj_request));
	osd_req->r_snapid = obj_request->img_request->snap_id;
}

static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
{
	struct ceph_osd_request *osd_req = obj_request->osd_req;

	osd_req->r_mtime = CURRENT_TIME;
	osd_req->r_data_offset = obj_request->offset;
}

/*
 * Create an osd request.  A read request has one osd op (read).
 * A write request has either one (watch) or two (hint+write) osd ops.
 * (All rbd data writes are prefixed with an allocation hint op, but
 * technically osd watch is a write request, hence this distinction.)
 */
static struct ceph_osd_request *rbd_osd_req_create(
					struct rbd_device *rbd_dev,
					enum obj_operation_type op_type,
					unsigned int num_ops,
					struct rbd_obj_request *obj_request)
{
	struct ceph_snap_context *snapc = NULL;
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;

	if (obj_request_img_data_test(obj_request) &&
		(op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) {
		struct rbd_img_request *img_request = obj_request->img_request;

		if (op_type == OBJ_OP_WRITE) {
			rbd_assert(img_request_write_test(img_request));
		} else {
			rbd_assert(img_request_discard_test(img_request));
		}
		snapc = img_request->snapc;
	}

	rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2));

	/* Allocate and initialize the request, for the num_ops ops */

	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
					  GFP_NOIO);
	if (!osd_req)
		goto fail;

	if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
	else
		osd_req->r_flags = CEPH_OSD_FLAG_READ;

	osd_req->r_callback = rbd_osd_req_callback;
	osd_req->r_priv = obj_request;

	osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id;
	if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
			obj_request->object_name))
		goto fail;

	if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
		goto fail;

	return osd_req;

fail:
	ceph_osdc_put_request(osd_req);
	return NULL;
}
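
/*
 * The ops array of the request returned above is filled in later by
 * rbd_img_obj_request_fill(); for a data request it ends up as
 * either
 *
 *	read:	[ READ ]
 *	write:	[ SETALLOCHINT, WRITE or WRITEFULL ]
 *
 * which is the layout the SETALLOCHINT case in rbd_osd_req_callback()
 * relies on.
 */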

/*
 * Create a copyup osd request based on the information in the object
 * request supplied.  A copyup request has two or three osd ops, a
 * copyup method call, potentially a hint op, and a write or truncate
 * or zero op.
 */
static struct ceph_osd_request *
rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	struct ceph_snap_context *snapc;
	struct rbd_device *rbd_dev;
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;
	int num_osd_ops = 3;

	rbd_assert(obj_request_img_data_test(obj_request));
	img_request = obj_request->img_request;
	rbd_assert(img_request);
	rbd_assert(img_request_write_test(img_request) ||
			img_request_discard_test(img_request));

	if (img_request_discard_test(img_request))
		num_osd_ops = 2;

	/* Allocate and initialize the request, for all the ops */

	snapc = img_request->snapc;
	rbd_dev = img_request->rbd_dev;
	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops,
						false, GFP_NOIO);
	if (!osd_req)
		goto fail;

	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
	osd_req->r_callback = rbd_osd_req_callback;
	osd_req->r_priv = obj_request;

	osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id;
	if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
			obj_request->object_name))
		goto fail;

	if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
		goto fail;

	return osd_req;

fail:
	ceph_osdc_put_request(osd_req);
	return NULL;
}
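
/*
 * Per the comment above, the resulting ops array (filled in
 * elsewhere by the copyup path) looks like
 *
 *	write:	 [ CALL copyup, SETALLOCHINT, WRITE or WRITEFULL ]
 *	discard: [ CALL copyup, TRUNCATE or ZERO ]
 *
 * matching the two- or three-op count chosen here.
 */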

static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}

/* object_name is assumed to be a non-null pointer and NUL-terminated */

static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
						u64 offset, u64 length,
						enum obj_request_type type)
{
	struct rbd_obj_request *obj_request;
	size_t size;
	char *name;

	rbd_assert(obj_request_type_valid(type));

	size = strlen(object_name) + 1;
	name = kmalloc(size, GFP_NOIO);
	if (!name)
		return NULL;

	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
	if (!obj_request) {
		kfree(name);
		return NULL;
	}

	obj_request->object_name = memcpy(name, object_name, size);
	obj_request->offset = offset;
	obj_request->length = length;
	obj_request->flags = 0;
	obj_request->which = BAD_WHICH;
	obj_request->type = type;
	INIT_LIST_HEAD(&obj_request->links);
	init_completion(&obj_request->completion);
	kref_init(&obj_request->kref);

	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
		offset, length, (int)type, obj_request);

	return obj_request;
}

static void rbd_obj_request_destroy(struct kref *kref)
{
	struct rbd_obj_request *obj_request;

	obj_request = container_of(kref, struct rbd_obj_request, kref);

	dout("%s: obj %p\n", __func__, obj_request);

	rbd_assert(obj_request->img_request == NULL);
	rbd_assert(obj_request->which == BAD_WHICH);

	if (obj_request->osd_req)
		rbd_osd_req_destroy(obj_request->osd_req);

	rbd_assert(obj_request_type_valid(obj_request->type));
	switch (obj_request->type) {
	case OBJ_REQUEST_NODATA:
		break;		/* Nothing to do */
	case OBJ_REQUEST_BIO:
		if (obj_request->bio_list)
			bio_chain_put(obj_request->bio_list);
		break;
	case OBJ_REQUEST_PAGES:
		if (obj_request->pages)
			ceph_release_page_vector(obj_request->pages,
						obj_request->page_count);
		break;
	}

	kfree(obj_request->object_name);
	obj_request->object_name = NULL;
	kmem_cache_free(rbd_obj_request_cache, obj_request);
}

/* It's OK to call this for a device with no parent */

static void rbd_spec_put(struct rbd_spec *spec);
static void rbd_dev_unparent(struct rbd_device *rbd_dev)
{
	rbd_dev_remove_parent(rbd_dev);
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;
}

/*
 * Parent image reference counting is used to determine when an
 * image's parent fields can be safely torn down--after there are no
 * more in-flight requests to the parent image.  When the last
 * reference is dropped, cleaning them up is safe.
 */
static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
{
	int counter;

	if (!rbd_dev->parent_spec)
		return;

	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
	if (counter > 0)
		return;

	/* Last reference; clean up parent data structures */

	if (!counter)
		rbd_dev_unparent(rbd_dev);
	else
		rbd_warn(rbd_dev, "parent reference underflow");
}

/*
 * If an image has a non-zero parent overlap, get a reference to its
 * parent.
 *
 * Returns true if the rbd device has a parent with a non-zero
 * overlap and a reference for it was successfully taken, or
 * false otherwise.
 */
static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
{
	int counter = 0;

	if (!rbd_dev->parent_spec)
		return false;

	down_read(&rbd_dev->header_rwsem);
	if (rbd_dev->parent_overlap)
		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
	up_read(&rbd_dev->header_rwsem);

	if (counter < 0)
		rbd_warn(rbd_dev, "parent reference overflow");

	return counter > 0;
}

/*
 * Caller is responsible for filling in the list of object requests
 * that comprises the image request, and the Linux request pointer
 * (if there is one).
 */
static struct rbd_img_request *rbd_img_request_create(
					struct rbd_device *rbd_dev,
					u64 offset, u64 length,
					enum obj_operation_type op_type,
					struct ceph_snap_context *snapc)
{
	struct rbd_img_request *img_request;

	img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
	if (!img_request)
		return NULL;

	img_request->rq = NULL;
	img_request->rbd_dev = rbd_dev;
	img_request->offset = offset;
	img_request->length = length;
	img_request->flags = 0;
	if (op_type == OBJ_OP_DISCARD) {
		img_request_discard_set(img_request);
		img_request->snapc = snapc;
	} else if (op_type == OBJ_OP_WRITE) {
		img_request_write_set(img_request);
		img_request->snapc = snapc;
	} else {
		img_request->snap_id = rbd_dev->spec->snap_id;
	}
	if (rbd_dev_parent_get(rbd_dev))
		img_request_layered_set(img_request);
	spin_lock_init(&img_request->completion_lock);
	img_request->next_completion = 0;
	img_request->callback = NULL;
	img_request->result = 0;
	img_request->obj_request_count = 0;
	INIT_LIST_HEAD(&img_request->obj_requests);
	kref_init(&img_request->kref);

	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
		obj_op_name(op_type), offset, length, img_request);

	return img_request;
}
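
/*
 * Typical life cycle of an image request, as suggested by the
 * helpers in this file: rbd_img_request_create() above,
 * rbd_img_request_fill() below to split it into object requests,
 * rbd_obj_request_submit() on each of those, rbd_img_obj_callback()
 * as each completes, and a final rbd_img_request_put() to drop the
 * initial reference.
 */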

static void rbd_img_request_destroy(struct kref *kref)
{
	struct rbd_img_request *img_request;
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	img_request = container_of(kref, struct rbd_img_request, kref);

	dout("%s: img %p\n", __func__, img_request);

	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_img_obj_request_del(img_request, obj_request);
	rbd_assert(img_request->obj_request_count == 0);

	if (img_request_layered_test(img_request)) {
		img_request_layered_clear(img_request);
		rbd_dev_parent_put(img_request->rbd_dev);
	}

	if (img_request_write_test(img_request) ||
		img_request_discard_test(img_request))
		ceph_put_snap_context(img_request->snapc);

	kmem_cache_free(rbd_img_request_cache, img_request);
}

static struct rbd_img_request *rbd_parent_request_create(
					struct rbd_obj_request *obj_request,
					u64 img_offset, u64 length)
{
	struct rbd_img_request *parent_request;
	struct rbd_device *rbd_dev;

	rbd_assert(obj_request->img_request);
	rbd_dev = obj_request->img_request->rbd_dev;

	parent_request = rbd_img_request_create(rbd_dev->parent, img_offset,
						length, OBJ_OP_READ, NULL);
	if (!parent_request)
		return NULL;

	img_request_child_set(parent_request);
	rbd_obj_request_get(obj_request);
	parent_request->obj_request = obj_request;

	return parent_request;
}

static void rbd_parent_request_destroy(struct kref *kref)
{
	struct rbd_img_request *parent_request;
	struct rbd_obj_request *orig_request;

	parent_request = container_of(kref, struct rbd_img_request, kref);
	orig_request = parent_request->obj_request;

	parent_request->obj_request = NULL;
	rbd_obj_request_put(orig_request);
	img_request_child_clear(parent_request);

	rbd_img_request_destroy(kref);
}

static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	unsigned int xferred;
	int result;
	bool more;

	rbd_assert(obj_request_img_data_test(obj_request));
	img_request = obj_request->img_request;

	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
	xferred = (unsigned int)obj_request->xferred;
	result = obj_request->result;
	if (result) {
		struct rbd_device *rbd_dev = img_request->rbd_dev;
		enum obj_operation_type op_type;

		if (img_request_discard_test(img_request))
			op_type = OBJ_OP_DISCARD;
		else if (img_request_write_test(img_request))
			op_type = OBJ_OP_WRITE;
		else
			op_type = OBJ_OP_READ;

		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)",
			obj_op_name(op_type), obj_request->length,
			obj_request->img_offset, obj_request->offset);
		rbd_warn(rbd_dev, "  result %d xferred %x",
			result, xferred);
		if (!img_request->result)
			img_request->result = result;
		/*
		 * Need to end I/O on the entire obj_request worth of
		 * bytes in case of error.
		 */
		xferred = obj_request->length;
	}

	/* Image object requests don't own their page array */

	if (obj_request->type == OBJ_REQUEST_PAGES) {
		obj_request->pages = NULL;
		obj_request->page_count = 0;
	}

	if (img_request_child_test(img_request)) {
		rbd_assert(img_request->obj_request != NULL);
		more = obj_request->which < img_request->obj_request_count - 1;
	} else {
		rbd_assert(img_request->rq != NULL);

		more = blk_update_request(img_request->rq, result, xferred);
		if (!more)
			__blk_mq_end_request(img_request->rq, result);
	}

	return more;
}

static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	u32 which = obj_request->which;
	bool more = true;

	rbd_assert(obj_request_img_data_test(obj_request));
	img_request = obj_request->img_request;

	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
	rbd_assert(img_request != NULL);
	rbd_assert(img_request->obj_request_count > 0);
	rbd_assert(which != BAD_WHICH);
	rbd_assert(which < img_request->obj_request_count);

	spin_lock_irq(&img_request->completion_lock);
	if (which != img_request->next_completion)
		goto out;

	for_each_obj_request_from(img_request, obj_request) {
		rbd_assert(more);
		rbd_assert(which < img_request->obj_request_count);

		if (!obj_request_done_test(obj_request))
			break;
		more = rbd_img_obj_end_request(obj_request);
		which++;
	}

	rbd_assert(more ^ (which == img_request->obj_request_count));
	img_request->next_completion = which;
out:
	spin_unlock_irq(&img_request->completion_lock);
	rbd_img_request_put(img_request);

	if (!more)
		rbd_img_request_complete(img_request);
}
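
/*
 * Object requests are always ended in "which" order.  For example,
 * if an image request has object requests 0..2 and they complete in
 * the order 1, 0, 2: the callback for 1 finds next_completion == 0
 * and defers; the callback for 0 then ends both 0 and 1, advancing
 * next_completion to 2; and the callback for 2 ends the last object
 * request and completes the image request.
 */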

/*
 * Add individual osd ops to the given ceph_osd_request and prepare
 * them for submission.  num_ops is the number of osd operations
 * already added to the request.
 */
static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request,
				struct ceph_osd_request *osd_request,
				enum obj_operation_type op_type,
				unsigned int num_ops)
{
	struct rbd_img_request *img_request = obj_request->img_request;
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	u64 object_size = rbd_obj_bytes(&rbd_dev->header);
	u64 offset = obj_request->offset;
	u64 length = obj_request->length;
	u64 img_end;
	u16 opcode;

	if (op_type == OBJ_OP_DISCARD) {
		if (!offset && length == object_size &&
		    (!img_request_layered_test(img_request) ||
		     !obj_request_overlaps_parent(obj_request))) {
			opcode = CEPH_OSD_OP_DELETE;
		} else if ((offset + length == object_size)) {
			opcode = CEPH_OSD_OP_TRUNCATE;
		} else {
			down_read(&rbd_dev->header_rwsem);
			img_end = rbd_dev->header.image_size;
			up_read(&rbd_dev->header_rwsem);

			if (obj_request->img_offset + length == img_end)
				opcode = CEPH_OSD_OP_TRUNCATE;
			else
				opcode = CEPH_OSD_OP_ZERO;
		}
	} else if (op_type == OBJ_OP_WRITE) {
		if (!offset && length == object_size)
			opcode = CEPH_OSD_OP_WRITEFULL;
		else
			opcode = CEPH_OSD_OP_WRITE;
		osd_req_op_alloc_hint_init(osd_request, num_ops,
					object_size, object_size);
		num_ops++;
	} else {
		opcode = CEPH_OSD_OP_READ;
	}

	if (opcode == CEPH_OSD_OP_DELETE)
		osd_req_op_init(osd_request, num_ops, opcode, 0);
	else
		osd_req_op_extent_init(osd_request, num_ops, opcode,
				       offset, length, 0, 0);

	if (obj_request->type == OBJ_REQUEST_BIO)
		osd_req_op_extent_osd_data_bio(osd_request, num_ops,
					obj_request->bio_list, length);
	else if (obj_request->type == OBJ_REQUEST_PAGES)
		osd_req_op_extent_osd_data_pages(osd_request, num_ops,
					obj_request->pages, length,
					offset & ~PAGE_MASK, false, false);

	/* Discards are also writes */
	if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
		rbd_osd_req_format_write(obj_request);
	else
		rbd_osd_req_format_read(obj_request);
}
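
/*
 * Summary of the opcode selection above:
 *
 *	read:					READ
 *	write, whole object:			SETALLOCHINT + WRITEFULL
 *	write, partial object:			SETALLOCHINT + WRITE
 *	discard, whole object, no parent
 *	  data beneath it:			DELETE
 *	discard to the end of the object
 *	  (or of the image):			TRUNCATE
 *	discard in the middle:			ZERO
 */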
2498
2499/*
Alex Elderf1a47392013-04-19 15:34:50 -05002500 * Split up an image request into one or more object requests, each
2501 * to a different object. The "type" parameter indicates whether
2502 * "data_desc" is the pointer to the head of a list of bio
2503 * structures, or the base of a page array. In either case this
2504 * function assumes data_desc describes memory sufficient to hold
2505 * all data described by the image request.
2506 */
2507static int rbd_img_request_fill(struct rbd_img_request *img_request,
2508 enum obj_request_type type,
2509 void *data_desc)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002510{
2511 struct rbd_device *rbd_dev = img_request->rbd_dev;
2512 struct rbd_obj_request *obj_request = NULL;
2513 struct rbd_obj_request *next_obj_request;
Jingoo Hana1580732013-08-09 13:04:35 +09002514 struct bio *bio_list = NULL;
Alex Elderf1a47392013-04-19 15:34:50 -05002515 unsigned int bio_offset = 0;
Jingoo Hana1580732013-08-09 13:04:35 +09002516 struct page **pages = NULL;
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08002517 enum obj_operation_type op_type;
Alex Elder7da22d22013-01-24 16:13:36 -06002518 u64 img_offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002519 u64 resid;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002520
Alex Elderf1a47392013-04-19 15:34:50 -05002521 dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2522 (int)type, data_desc);
Alex Elder37206ee2013-02-20 17:32:08 -06002523
Alex Elder7da22d22013-01-24 16:13:36 -06002524 img_offset = img_request->offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002525 resid = img_request->length;
Alex Elder4dda41d2013-02-20 21:59:33 -06002526 rbd_assert(resid > 0);
Josh Durgin3b434a2a2014-04-04 17:32:15 -07002527 op_type = rbd_img_request_op_type(img_request);
Alex Elderf1a47392013-04-19 15:34:50 -05002528
2529 if (type == OBJ_REQUEST_BIO) {
2530 bio_list = data_desc;
Kent Overstreet4f024f32013-10-11 15:44:27 -07002531 rbd_assert(img_offset ==
2532 bio_list->bi_iter.bi_sector << SECTOR_SHIFT);
Guangliang Zhao90e98c52014-04-01 22:22:16 +08002533 } else if (type == OBJ_REQUEST_PAGES) {
Alex Elderf1a47392013-04-19 15:34:50 -05002534 pages = data_desc;
2535 }
2536
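	/*
	 * Walk the image extent one object at a time, creating an
	 * object request (and the osd request that carries it) for
	 * each segment touched.
	 */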
Alex Elderbf0d5f502012-11-22 00:00:08 -06002537 while (resid) {
Alex Elder2fa12322013-04-05 01:27:12 -05002538 struct ceph_osd_request *osd_req;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002539 const char *object_name;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002540 u64 offset;
2541 u64 length;
2542
Alex Elder7da22d22013-01-24 16:13:36 -06002543 object_name = rbd_segment_name(rbd_dev, img_offset);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002544 if (!object_name)
2545 goto out_unwind;
Alex Elder7da22d22013-01-24 16:13:36 -06002546 offset = rbd_segment_offset(rbd_dev, img_offset);
2547 length = rbd_segment_length(rbd_dev, img_offset, resid);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002548 obj_request = rbd_obj_request_create(object_name,
Alex Elderf1a47392013-04-19 15:34:50 -05002549 offset, length, type);
Alex Elder78c2a442013-05-01 12:43:04 -05002550 /* object request has its own copy of the object name */
2551 rbd_segment_name_free(object_name);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002552 if (!obj_request)
2553 goto out_unwind;
Ilya Dryomov62054da2014-03-04 11:57:17 +02002554
Josh Durgin03507db2013-08-27 14:45:46 -07002555 /*
2556 * set obj_request->img_request before creating the
2557 * osd_request so that it gets the right snapc
2558 */
2559 rbd_img_obj_request_add(img_request, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002560
Alex Elderf1a47392013-04-19 15:34:50 -05002561 if (type == OBJ_REQUEST_BIO) {
2562 unsigned int clone_size;
2563
2564 rbd_assert(length <= (u64)UINT_MAX);
2565 clone_size = (unsigned int)length;
2566 obj_request->bio_list =
2567 bio_chain_clone_range(&bio_list,
2568 &bio_offset,
2569 clone_size,
David Disseldorp2224d872016-04-05 11:13:39 +02002570 GFP_NOIO);
Alex Elderf1a47392013-04-19 15:34:50 -05002571 if (!obj_request->bio_list)
Ilya Dryomov62054da2014-03-04 11:57:17 +02002572 goto out_unwind;
Guangliang Zhao90e98c52014-04-01 22:22:16 +08002573 } else if (type == OBJ_REQUEST_PAGES) {
Alex Elderf1a47392013-04-19 15:34:50 -05002574 unsigned int page_count;
2575
2576 obj_request->pages = pages;
2577 page_count = (u32)calc_pages_for(offset, length);
2578 obj_request->page_count = page_count;
2579 if ((offset + length) & ~PAGE_MASK)
2580 page_count--; /* more on last page */
2581 pages += page_count;
2582 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06002583
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08002584 osd_req = rbd_osd_req_create(rbd_dev, op_type,
2585 (op_type == OBJ_OP_WRITE) ? 2 : 1,
2586 obj_request);
Alex Elder2fa12322013-04-05 01:27:12 -05002587 if (!osd_req)
Ilya Dryomov62054da2014-03-04 11:57:17 +02002588 goto out_unwind;
Josh Durgin3b434a2a2014-04-04 17:32:15 -07002589
Alex Elder2fa12322013-04-05 01:27:12 -05002590 obj_request->osd_req = osd_req;
Alex Elder21692382013-04-05 01:27:12 -05002591 obj_request->callback = rbd_img_obj_callback;
Alex Elder7da22d22013-01-24 16:13:36 -06002592 obj_request->img_offset = img_offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002593
Josh Durgin3b434a2a2014-04-04 17:32:15 -07002594 rbd_img_obj_request_fill(obj_request, osd_req, op_type, 0);
2595
Alex Elder7da22d22013-01-24 16:13:36 -06002596 img_offset += length;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002597 resid -= length;
2598 }
2599
2600 return 0;
2601
Alex Elderbf0d5f502012-11-22 00:00:08 -06002602out_unwind:
2603 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
Ilya Dryomov42dd0372014-03-04 11:57:17 +02002604 rbd_img_obj_request_del(img_request, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002605
2606 return -ENOMEM;
2607}
2608
Alex Elder3d7efd12013-04-19 15:34:50 -05002609static void
Ilya Dryomov27617132015-07-16 17:36:11 +03002610rbd_osd_copyup_callback(struct rbd_obj_request *obj_request)
Alex Elder0eefd472013-04-19 15:34:50 -05002611{
2612 struct rbd_img_request *img_request;
2613 struct rbd_device *rbd_dev;
Alex Elderebda6402013-05-10 16:29:22 -05002614 struct page **pages;
Alex Elder0eefd472013-04-19 15:34:50 -05002615 u32 page_count;
2616
Ilya Dryomov27617132015-07-16 17:36:11 +03002617 dout("%s: obj %p\n", __func__, obj_request);
2618
Josh Durgind3246fb2014-04-07 16:49:21 -07002619 rbd_assert(obj_request->type == OBJ_REQUEST_BIO ||
2620 obj_request->type == OBJ_REQUEST_NODATA);
Alex Elder0eefd472013-04-19 15:34:50 -05002621 rbd_assert(obj_request_img_data_test(obj_request));
2622 img_request = obj_request->img_request;
2623 rbd_assert(img_request);
2624
2625 rbd_dev = img_request->rbd_dev;
2626 rbd_assert(rbd_dev);
Alex Elder0eefd472013-04-19 15:34:50 -05002627
Alex Elderebda6402013-05-10 16:29:22 -05002628 pages = obj_request->copyup_pages;
2629 rbd_assert(pages != NULL);
Alex Elder0eefd472013-04-19 15:34:50 -05002630 obj_request->copyup_pages = NULL;
Alex Elderebda6402013-05-10 16:29:22 -05002631 page_count = obj_request->copyup_page_count;
2632 rbd_assert(page_count);
2633 obj_request->copyup_page_count = 0;
2634 ceph_release_page_vector(pages, page_count);
Alex Elder0eefd472013-04-19 15:34:50 -05002635
2636 /*
2637 * We want the transfer count to reflect the size of the
2638 * original write request. There is no such thing as a
2639 * successful short write, so if the request was successful
2640 * we can just set it to the originally-requested length.
2641 */
2642 if (!obj_request->result)
2643 obj_request->xferred = obj_request->length;
2644
Ilya Dryomov27617132015-07-16 17:36:11 +03002645 obj_request_done_set(obj_request);
Alex Elder0eefd472013-04-19 15:34:50 -05002646}
2647
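/*
 * Completion callback for the parent read issued by
 * rbd_img_obj_parent_read_full().  The pages of parent data are
 * handed to the original object request, which is then reissued as
 * a copyup: the "copyup" class method populates the target object
 * with the parent data before the original write or discard op(s)
 * in the same request are applied.
 */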
2648static void
Alex Elder3d7efd12013-04-19 15:34:50 -05002649rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
2650{
2651 struct rbd_obj_request *orig_request;
Alex Elder0eefd472013-04-19 15:34:50 -05002652 struct ceph_osd_request *osd_req;
Alex Elder0eefd472013-04-19 15:34:50 -05002653 struct rbd_device *rbd_dev;
Alex Elder3d7efd12013-04-19 15:34:50 -05002654 struct page **pages;
Josh Durgind3246fb2014-04-07 16:49:21 -07002655 enum obj_operation_type op_type;
Alex Elderebda6402013-05-10 16:29:22 -05002656 u32 page_count;
Alex Elderbbea1c12013-05-06 17:40:33 -05002657 int img_result;
Alex Elderebda6402013-05-10 16:29:22 -05002658 u64 parent_length;
Alex Elder3d7efd12013-04-19 15:34:50 -05002659
2660 rbd_assert(img_request_child_test(img_request));
2661
2662 /* First get what we need from the image request */
2663
2664 pages = img_request->copyup_pages;
2665 rbd_assert(pages != NULL);
2666 img_request->copyup_pages = NULL;
Alex Elderebda6402013-05-10 16:29:22 -05002667 page_count = img_request->copyup_page_count;
2668 rbd_assert(page_count);
2669 img_request->copyup_page_count = 0;
Alex Elder3d7efd12013-04-19 15:34:50 -05002670
2671 orig_request = img_request->obj_request;
2672 rbd_assert(orig_request != NULL);
Alex Elderb91f09f2013-05-10 16:29:22 -05002673 rbd_assert(obj_request_type_valid(orig_request->type));
Alex Elderbbea1c12013-05-06 17:40:33 -05002674 img_result = img_request->result;
Alex Elderebda6402013-05-10 16:29:22 -05002675 parent_length = img_request->length;
Ilya Dryomovfa355112016-09-16 15:20:42 +02002676 rbd_assert(img_result || parent_length == img_request->xferred);
Alex Elder3d7efd12013-04-19 15:34:50 -05002677 rbd_img_request_put(img_request);
2678
Alex Elder91c6feb2013-05-06 17:40:32 -05002679 rbd_assert(orig_request->img_request);
2680 rbd_dev = orig_request->img_request->rbd_dev;
Alex Elder3d7efd12013-04-19 15:34:50 -05002681 rbd_assert(rbd_dev);
Alex Elder3d7efd12013-04-19 15:34:50 -05002682
Alex Elderbbea1c12013-05-06 17:40:33 -05002683 /*
2684 * If the overlap has become 0 (most likely because the
2685 * image has been flattened) we need to free the pages
2686 * and re-submit the original write request.
2687 */
2688 if (!rbd_dev->parent_overlap) {
Alex Elderbbea1c12013-05-06 17:40:33 -05002689 ceph_release_page_vector(pages, page_count);
Ilya Dryomov980917f2016-09-12 18:59:42 +02002690 rbd_obj_request_submit(orig_request);
2691 return;
Alex Elderbbea1c12013-05-06 17:40:33 -05002692 }
2693
2694 if (img_result)
Alex Elder0eefd472013-04-19 15:34:50 -05002695 goto out_err;
Alex Elder3d7efd12013-04-19 15:34:50 -05002696
Alex Elder8785b1d2013-05-09 10:08:49 -05002697 /*
2698	 * The original osd request is of no use to us any more.
Ilya Dryomov0ccd5922014-02-25 16:22:28 +02002699 * We need a new one that can hold the three ops in a copyup
Alex Elder8785b1d2013-05-09 10:08:49 -05002700 * request. Allocate the new copyup osd request for the
2701 * original request, and release the old one.
2702 */
Alex Elderbbea1c12013-05-06 17:40:33 -05002703 img_result = -ENOMEM;
Alex Elder0eefd472013-04-19 15:34:50 -05002704 osd_req = rbd_osd_req_create_copyup(orig_request);
2705 if (!osd_req)
2706 goto out_err;
Alex Elder8785b1d2013-05-09 10:08:49 -05002707 rbd_osd_req_destroy(orig_request->osd_req);
Alex Elder0eefd472013-04-19 15:34:50 -05002708 orig_request->osd_req = osd_req;
2709 orig_request->copyup_pages = pages;
Alex Elderebda6402013-05-10 16:29:22 -05002710 orig_request->copyup_page_count = page_count;
Alex Elder3d7efd12013-04-19 15:34:50 -05002711
Alex Elder0eefd472013-04-19 15:34:50 -05002712 /* Initialize the copyup op */
2713
2714 osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
Alex Elderebda6402013-05-10 16:29:22 -05002715 osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
Alex Elder0eefd472013-04-19 15:34:50 -05002716 false, false);
2717
Josh Durgind3246fb2014-04-07 16:49:21 -07002718 /* Add the other op(s) */
Ilya Dryomov0ccd5922014-02-25 16:22:28 +02002719
Josh Durgind3246fb2014-04-07 16:49:21 -07002720 op_type = rbd_img_request_op_type(orig_request->img_request);
2721 rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1);
Alex Elder0eefd472013-04-19 15:34:50 -05002722
2723 /* All set, send it off. */
2724
Ilya Dryomov980917f2016-09-12 18:59:42 +02002725 rbd_obj_request_submit(orig_request);
2726 return;
2727
Alex Elder0eefd472013-04-19 15:34:50 -05002728out_err:
Ilya Dryomovfa355112016-09-16 15:20:42 +02002729 ceph_release_page_vector(pages, page_count);
Alex Elderbbea1c12013-05-06 17:40:33 -05002730 orig_request->result = img_result;
Alex Elder0eefd472013-04-19 15:34:50 -05002731 orig_request->xferred = 0;
Ilya Dryomov4a17dad2016-09-13 21:08:10 +02002732 rbd_img_request_get(orig_request->img_request);
Alex Elder0eefd472013-04-19 15:34:50 -05002733 obj_request_done_set(orig_request);
2734 rbd_obj_request_complete(orig_request);
Alex Elder3d7efd12013-04-19 15:34:50 -05002735}
2736
2737/*
2738 * Read from the parent image the range of data that covers the
2739 * entire target of the given object request. This is used for
2740 * satisfying a layered image write request when the target of an
2741 * object request from the image request does not exist.
2742 *
2743 * A page array big enough to hold the returned data is allocated
2744 * and supplied to rbd_img_request_fill() as the "data descriptor."
2745 * When the read completes, this page array will be transferred to
2746 * the original object request for the copyup operation.
2747 *
Ilya Dryomovc2e82412016-09-13 20:18:01 +02002748 * If an error occurs, it is recorded as the result of the original
2749 * object request in rbd_img_obj_exists_callback().
Alex Elder3d7efd12013-04-19 15:34:50 -05002750 */
2751static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
2752{
Ilya Dryomov058aa992016-09-12 14:44:45 +02002753 struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
Alex Elder3d7efd12013-04-19 15:34:50 -05002754 struct rbd_img_request *parent_request = NULL;
Alex Elder3d7efd12013-04-19 15:34:50 -05002755 u64 img_offset;
2756 u64 length;
2757 struct page **pages = NULL;
2758 u32 page_count;
2759 int result;
2760
Alex Elder3d7efd12013-04-19 15:34:50 -05002761 rbd_assert(rbd_dev->parent != NULL);
2762
2763 /*
2764 * Determine the byte range covered by the object in the
2765 * child image to which the original request was to be sent.
2766 */
2767 img_offset = obj_request->img_offset - obj_request->offset;
2768 length = (u64)1 << rbd_dev->header.obj_order;
2769
2770 /*
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002771 * There is no defined parent data beyond the parent
2772 * overlap, so limit what we read at that boundary if
2773 * necessary.
2774 */
2775 if (img_offset + length > rbd_dev->parent_overlap) {
2776 rbd_assert(img_offset < rbd_dev->parent_overlap);
2777 length = rbd_dev->parent_overlap - img_offset;
2778 }
2779
2780 /*
Alex Elder3d7efd12013-04-19 15:34:50 -05002781 * Allocate a page array big enough to receive the data read
2782 * from the parent.
2783 */
2784 page_count = (u32)calc_pages_for(0, length);
2785 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2786 if (IS_ERR(pages)) {
2787 result = PTR_ERR(pages);
2788 pages = NULL;
2789 goto out_err;
2790 }
2791
2792 result = -ENOMEM;
Alex Eldere93f3152013-05-08 22:50:04 -05002793 parent_request = rbd_parent_request_create(obj_request,
2794 img_offset, length);
Alex Elder3d7efd12013-04-19 15:34:50 -05002795 if (!parent_request)
2796 goto out_err;
Alex Elder3d7efd12013-04-19 15:34:50 -05002797
2798 result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
2799 if (result)
2800 goto out_err;
Ilya Dryomov058aa992016-09-12 14:44:45 +02002801
Alex Elder3d7efd12013-04-19 15:34:50 -05002802 parent_request->copyup_pages = pages;
Alex Elderebda6402013-05-10 16:29:22 -05002803 parent_request->copyup_page_count = page_count;
Alex Elder3d7efd12013-04-19 15:34:50 -05002804 parent_request->callback = rbd_img_obj_parent_read_full_callback;
Ilya Dryomov058aa992016-09-12 14:44:45 +02002805
Alex Elder3d7efd12013-04-19 15:34:50 -05002806 result = rbd_img_request_submit(parent_request);
2807 if (!result)
2808 return 0;
2809
2810 parent_request->copyup_pages = NULL;
Alex Elderebda6402013-05-10 16:29:22 -05002811 parent_request->copyup_page_count = 0;
Alex Elder3d7efd12013-04-19 15:34:50 -05002812 parent_request->obj_request = NULL;
2813 rbd_obj_request_put(obj_request);
2814out_err:
2815 if (pages)
2816 ceph_release_page_vector(pages, page_count);
2817 if (parent_request)
2818 rbd_img_request_put(parent_request);
Alex Elder3d7efd12013-04-19 15:34:50 -05002819 return result;
2820}
2821
Alex Elderc5b5ef62013-02-11 12:33:24 -06002822static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2823{
Alex Elderc5b5ef62013-02-11 12:33:24 -06002824 struct rbd_obj_request *orig_request;
Alex Elder638f5ab2013-05-06 17:40:33 -05002825 struct rbd_device *rbd_dev;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002826 int result;
2827
2828 rbd_assert(!obj_request_img_data_test(obj_request));
2829
2830 /*
2831 * All we need from the object request is the original
2832 * request and the result of the STAT op. Grab those, then
2833 * we're done with the request.
2834 */
2835 orig_request = obj_request->obj_request;
2836 obj_request->obj_request = NULL;
Alex Elder912c3172013-05-13 20:35:38 -05002837 rbd_obj_request_put(orig_request);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002838 rbd_assert(orig_request);
2839 rbd_assert(orig_request->img_request);
2840
2841 result = obj_request->result;
2842 obj_request->result = 0;
2843
2844 dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2845 obj_request, orig_request, result,
2846 obj_request->xferred, obj_request->length);
2847 rbd_obj_request_put(obj_request);
2848
Alex Elder638f5ab2013-05-06 17:40:33 -05002849 /*
2850 * If the overlap has become 0 (most likely because the
Ilya Dryomov980917f2016-09-12 18:59:42 +02002851 * image has been flattened) we need to re-submit the
2852 * original request.
Alex Elder638f5ab2013-05-06 17:40:33 -05002853 */
2854 rbd_dev = orig_request->img_request->rbd_dev;
2855 if (!rbd_dev->parent_overlap) {
Ilya Dryomov980917f2016-09-12 18:59:42 +02002856 rbd_obj_request_submit(orig_request);
2857 return;
Alex Elder638f5ab2013-05-06 17:40:33 -05002858 }
Alex Elderc5b5ef62013-02-11 12:33:24 -06002859
2860 /*
2861 * Our only purpose here is to determine whether the object
2862 * exists, and we don't want to treat the non-existence as
2863 * an error. If something else comes back, transfer the
2864 * error to the original request and complete it now.
2865 */
2866 if (!result) {
2867 obj_request_existence_set(orig_request, true);
2868 } else if (result == -ENOENT) {
2869 obj_request_existence_set(orig_request, false);
Ilya Dryomovc2e82412016-09-13 20:18:01 +02002870 } else {
2871 goto fail_orig_request;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002872 }
2873
2874 /*
2875 * Resubmit the original request now that we have recorded
2876 * whether the target object exists.
2877 */
Ilya Dryomovc2e82412016-09-13 20:18:01 +02002878 result = rbd_img_obj_request_submit(orig_request);
2879 if (result)
2880 goto fail_orig_request;
2881
2882 return;
2883
2884fail_orig_request:
2885 orig_request->result = result;
2886 orig_request->xferred = 0;
Ilya Dryomov4a17dad2016-09-13 21:08:10 +02002887 rbd_img_request_get(orig_request->img_request);
Ilya Dryomovc2e82412016-09-13 20:18:01 +02002888 obj_request_done_set(orig_request);
2889 rbd_obj_request_complete(orig_request);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002890}
2891
2892static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2893{
Ilya Dryomov058aa992016-09-12 14:44:45 +02002894 struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002895 struct rbd_obj_request *stat_request;
Ilya Dryomov710214e2016-09-15 17:53:32 +02002896 struct page **pages;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002897 u32 page_count;
2898 size_t size;
2899 int ret;
2900
Ilya Dryomov710214e2016-09-15 17:53:32 +02002901 stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2902 OBJ_REQUEST_PAGES);
2903 if (!stat_request)
2904 return -ENOMEM;
2905
2906 stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
2907 stat_request);
2908 if (!stat_request->osd_req) {
2909 ret = -ENOMEM;
2910 goto fail_stat_request;
2911 }
2912
Alex Elderc5b5ef62013-02-11 12:33:24 -06002913 /*
2914 * The response data for a STAT call consists of:
2915 * le64 length;
2916 * struct {
2917 * le32 tv_sec;
2918 * le32 tv_nsec;
2919 * } mtime;
2920 */
2921 size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2922 page_count = (u32)calc_pages_for(0, size);
2923 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
Ilya Dryomov710214e2016-09-15 17:53:32 +02002924 if (IS_ERR(pages)) {
2925 ret = PTR_ERR(pages);
2926 goto fail_stat_request;
2927 }
Alex Elderc5b5ef62013-02-11 12:33:24 -06002928
Ilya Dryomov710214e2016-09-15 17:53:32 +02002929 osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0);
2930 osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2931 false, false);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002932
2933 rbd_obj_request_get(obj_request);
2934 stat_request->obj_request = obj_request;
2935 stat_request->pages = pages;
2936 stat_request->page_count = page_count;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002937 stat_request->callback = rbd_img_obj_exists_callback;
2938
Ilya Dryomov980917f2016-09-12 18:59:42 +02002939 rbd_obj_request_submit(stat_request);
2940 return 0;
2941
Ilya Dryomov710214e2016-09-15 17:53:32 +02002942fail_stat_request:
2943 rbd_obj_request_put(stat_request);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002944 return ret;
2945}
2946
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002947static bool img_obj_request_simple(struct rbd_obj_request *obj_request)
Alex Elderb454e362013-04-19 15:34:50 -05002948{
Ilya Dryomov058aa992016-09-12 14:44:45 +02002949 struct rbd_img_request *img_request = obj_request->img_request;
2950 struct rbd_device *rbd_dev = img_request->rbd_dev;
Alex Elderb454e362013-04-19 15:34:50 -05002951
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002952 /* Reads */
Josh Durgin1c220882014-04-04 17:49:12 -07002953 if (!img_request_write_test(img_request) &&
2954 !img_request_discard_test(img_request))
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002955 return true;
Alex Elderb454e362013-04-19 15:34:50 -05002956
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002957 /* Non-layered writes */
2958 if (!img_request_layered_test(img_request))
2959 return true;
2960
2961 /*
2962 * Layered writes outside of the parent overlap range don't
2963 * share any data with the parent.
2964 */
2965 if (!obj_request_overlaps_parent(obj_request))
2966 return true;
2967
2968 /*
Guangliang Zhaoc622d222014-04-01 22:22:15 +08002969 * Entire-object layered writes - we will overwrite whatever
2970 * parent data there is anyway.
2971 */
2972 if (!obj_request->offset &&
2973 obj_request->length == rbd_obj_bytes(&rbd_dev->header))
2974 return true;
2975
2976 /*
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002977 * If the object is known to already exist, its parent data has
2978 * already been copied.
2979 */
2980 if (obj_request_known_test(obj_request) &&
2981 obj_request_exists_test(obj_request))
2982 return true;
2983
2984 return false;
2985}
2986
2987static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2988{
Ilya Dryomov058aa992016-09-12 14:44:45 +02002989 rbd_assert(obj_request_img_data_test(obj_request));
2990 rbd_assert(obj_request_type_valid(obj_request->type));
2991 rbd_assert(obj_request->img_request);
2992
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002993 if (img_obj_request_simple(obj_request)) {
Ilya Dryomov980917f2016-09-12 18:59:42 +02002994 rbd_obj_request_submit(obj_request);
2995 return 0;
Alex Elderb454e362013-04-19 15:34:50 -05002996 }
2997
2998 /*
Alex Elder3d7efd12013-04-19 15:34:50 -05002999 * It's a layered write. The target object might exist but
3000 * we may not know that yet. If we know it doesn't exist,
3001 * start by reading the data for the full target object from
3002 * the parent so we can use it for a copyup to the target.
Alex Elderb454e362013-04-19 15:34:50 -05003003 */
Ilya Dryomov70d045f2014-09-12 16:02:01 +04003004 if (obj_request_known_test(obj_request))
Alex Elder3d7efd12013-04-19 15:34:50 -05003005 return rbd_img_obj_parent_read_full(obj_request);
3006
3007 /* We don't know whether the target exists. Go find out. */
Alex Elderb454e362013-04-19 15:34:50 -05003008
3009 return rbd_img_obj_exists_submit(obj_request);
3010}
3011
Alex Elderbf0d5f502012-11-22 00:00:08 -06003012static int rbd_img_request_submit(struct rbd_img_request *img_request)
3013{
Alex Elderbf0d5f502012-11-22 00:00:08 -06003014 struct rbd_obj_request *obj_request;
Alex Elder46faeed2013-04-10 17:47:46 -05003015 struct rbd_obj_request *next_obj_request;
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02003016 int ret = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06003017
Alex Elder37206ee2013-02-20 17:32:08 -06003018 dout("%s: img %p\n", __func__, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06003019
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02003020 rbd_img_request_get(img_request);
3021 for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
Alex Elderb454e362013-04-19 15:34:50 -05003022 ret = rbd_img_obj_request_submit(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06003023 if (ret)
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02003024 goto out_put_ireq;
Alex Elderbf0d5f502012-11-22 00:00:08 -06003025 }
3026
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02003027out_put_ireq:
3028 rbd_img_request_put(img_request);
3029 return ret;
Alex Elderbf0d5f502012-11-22 00:00:08 -06003030}
3031
Alex Elder8b3e1a52013-01-24 16:13:36 -06003032static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
3033{
3034 struct rbd_obj_request *obj_request;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05003035 struct rbd_device *rbd_dev;
3036 u64 obj_end;
Alex Elder02c74fb2013-05-06 17:40:33 -05003037 u64 img_xferred;
3038 int img_result;
Alex Elder8b3e1a52013-01-24 16:13:36 -06003039
3040 rbd_assert(img_request_child_test(img_request));
3041
Alex Elder02c74fb2013-05-06 17:40:33 -05003042 /* First get what we need from the image request and release it */
3043
Alex Elder8b3e1a52013-01-24 16:13:36 -06003044 obj_request = img_request->obj_request;
Alex Elder02c74fb2013-05-06 17:40:33 -05003045 img_xferred = img_request->xferred;
3046 img_result = img_request->result;
3047 rbd_img_request_put(img_request);
3048
3049 /*
3050 * If the overlap has become 0 (most likely because the
3051 * image has been flattened) we need to re-submit the
3052 * original request.
3053 */
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05003054 rbd_assert(obj_request);
3055 rbd_assert(obj_request->img_request);
Alex Elder02c74fb2013-05-06 17:40:33 -05003056 rbd_dev = obj_request->img_request->rbd_dev;
3057 if (!rbd_dev->parent_overlap) {
Ilya Dryomov980917f2016-09-12 18:59:42 +02003058 rbd_obj_request_submit(obj_request);
3059 return;
Alex Elder02c74fb2013-05-06 17:40:33 -05003060 }
3061
3062 obj_request->result = img_result;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05003063 if (obj_request->result)
3064 goto out;
3065
3066 /*
3067 * We need to zero anything beyond the parent overlap
3068 * boundary. Since rbd_img_obj_request_read_callback()
3069 * will zero anything beyond the end of a short read, an
3070 * easy way to do this is to pretend the data from the
3071 * parent came up short--ending at the overlap boundary.
3072 */
3073 rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
3074 obj_end = obj_request->img_offset + obj_request->length;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05003075 if (obj_end > rbd_dev->parent_overlap) {
3076 u64 xferred = 0;
3077
3078 if (obj_request->img_offset < rbd_dev->parent_overlap)
3079 xferred = rbd_dev->parent_overlap -
3080 obj_request->img_offset;
3081
Alex Elder02c74fb2013-05-06 17:40:33 -05003082 obj_request->xferred = min(img_xferred, xferred);
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05003083 } else {
Alex Elder02c74fb2013-05-06 17:40:33 -05003084 obj_request->xferred = img_xferred;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05003085 }
3086out:
Alex Elder8b3e1a52013-01-24 16:13:36 -06003087 rbd_img_obj_request_read_callback(obj_request);
3088 rbd_obj_request_complete(obj_request);
3089}
3090
3091static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
3092{
Alex Elder8b3e1a52013-01-24 16:13:36 -06003093 struct rbd_img_request *img_request;
3094 int result;
3095
3096 rbd_assert(obj_request_img_data_test(obj_request));
3097 rbd_assert(obj_request->img_request != NULL);
3098 rbd_assert(obj_request->result == (s32) -ENOENT);
Alex Elder5b2ab722013-05-06 17:40:33 -05003099 rbd_assert(obj_request_type_valid(obj_request->type));
Alex Elder8b3e1a52013-01-24 16:13:36 -06003100
Alex Elder8b3e1a52013-01-24 16:13:36 -06003101 /* rbd_read_finish(obj_request, obj_request->length); */
Alex Eldere93f3152013-05-08 22:50:04 -05003102 img_request = rbd_parent_request_create(obj_request,
Alex Elder8b3e1a52013-01-24 16:13:36 -06003103 obj_request->img_offset,
Alex Eldere93f3152013-05-08 22:50:04 -05003104 obj_request->length);
Alex Elder8b3e1a52013-01-24 16:13:36 -06003105 result = -ENOMEM;
3106 if (!img_request)
3107 goto out_err;
3108
Alex Elder5b2ab722013-05-06 17:40:33 -05003109 if (obj_request->type == OBJ_REQUEST_BIO)
3110 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
3111 obj_request->bio_list);
3112 else
3113 result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES,
3114 obj_request->pages);
Alex Elder8b3e1a52013-01-24 16:13:36 -06003115 if (result)
3116 goto out_err;
3117
3118 img_request->callback = rbd_img_parent_read_callback;
3119 result = rbd_img_request_submit(img_request);
3120 if (result)
3121 goto out_err;
3122
3123 return;
3124out_err:
3125 if (img_request)
3126 rbd_img_request_put(img_request);
3127 obj_request->result = result;
3128 obj_request->xferred = 0;
3129 obj_request_done_set(obj_request);
3130}
3131
Ilya Dryomoved95b212016-08-12 16:40:02 +02003132static const struct rbd_client_id rbd_empty_cid;
3133
3134static bool rbd_cid_equal(const struct rbd_client_id *lhs,
3135 const struct rbd_client_id *rhs)
3136{
3137 return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
3138}
3139
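/*
 * A client id is the gid of our ceph client paired with the watch
 * cookie; together they identify this instance to the other lock
 * participants.
 */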
3140static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
3141{
3142 struct rbd_client_id cid;
3143
3144 mutex_lock(&rbd_dev->watch_mutex);
3145 cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
3146 cid.handle = rbd_dev->watch_cookie;
3147 mutex_unlock(&rbd_dev->watch_mutex);
3148 return cid;
3149}
3150
3151/*
3152 * lock_rwsem must be held for write
3153 */
3154static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
3155 const struct rbd_client_id *cid)
3156{
3157 dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
3158 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
3159 cid->gid, cid->handle);
3160 rbd_dev->owner_cid = *cid; /* struct */
3161}
3162
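/*
 * The lock cookie is "<RBD_LOCK_COOKIE_PREFIX> <watch cookie>";
 * find_watcher() relies on this format when matching a locker to a
 * watch.
 */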
3163static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
3164{
3165 mutex_lock(&rbd_dev->watch_mutex);
3166 sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
3167 mutex_unlock(&rbd_dev->watch_mutex);
3168}
3169
3170/*
3171 * lock_rwsem must be held for write
3172 */
3173static int rbd_lock(struct rbd_device *rbd_dev)
3174{
3175 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3176 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3177 char cookie[32];
3178 int ret;
3179
3180 WARN_ON(__rbd_is_lock_owner(rbd_dev));
3181
3182 format_lock_cookie(rbd_dev, cookie);
3183 ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3184 RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
3185 RBD_LOCK_TAG, "", 0);
3186 if (ret)
3187 return ret;
3188
3189 rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
3190 rbd_set_owner_cid(rbd_dev, &cid);
3191 queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
3192 return 0;
3193}
3194
3195/*
3196 * lock_rwsem must be held for write
3197 */
3198static int rbd_unlock(struct rbd_device *rbd_dev)
3199{
3200 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3201 char cookie[32];
3202 int ret;
3203
3204 WARN_ON(!__rbd_is_lock_owner(rbd_dev));
3205
3206 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
3207
3208 format_lock_cookie(rbd_dev, cookie);
3209 ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3210 RBD_LOCK_NAME, cookie);
3211 if (ret && ret != -ENOENT) {
3212 rbd_warn(rbd_dev, "cls_unlock failed: %d", ret);
3213 return ret;
3214 }
3215
3216 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3217 queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
3218 return 0;
3219}
3220
3221static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
3222 enum rbd_notify_op notify_op,
3223 struct page ***preply_pages,
3224 size_t *preply_len)
3225{
3226 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3227 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3228 int buf_size = 4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN;
3229 char buf[buf_size];
3230 void *p = buf;
3231
3232 dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
3233
3234 /* encode *LockPayload NotifyMessage (op + ClientId) */
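	/*
	 * Wire format, little-endian: a CEPH_ENCODING_START_BLK_LEN
	 * header (struct_v 2, compat 1, u32 payload length) followed
	 * by u32 notify_op and the u64 gid + u64 handle of ClientId.
	 */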
3235 ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
3236 ceph_encode_32(&p, notify_op);
3237 ceph_encode_64(&p, cid.gid);
3238 ceph_encode_64(&p, cid.handle);
3239
3240 return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
3241 &rbd_dev->header_oloc, buf, buf_size,
3242 RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
3243}
3244
3245static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
3246 enum rbd_notify_op notify_op)
3247{
3248 struct page **reply_pages;
3249 size_t reply_len;
3250
3251 __rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
3252 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3253}
3254
3255static void rbd_notify_acquired_lock(struct work_struct *work)
3256{
3257 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3258 acquired_lock_work);
3259
3260 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
3261}
3262
3263static void rbd_notify_released_lock(struct work_struct *work)
3264{
3265 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3266 released_lock_work);
3267
3268 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
3269}
3270
3271static int rbd_request_lock(struct rbd_device *rbd_dev)
3272{
3273 struct page **reply_pages;
3274 size_t reply_len;
3275 bool lock_owner_responded = false;
3276 int ret;
3277
3278 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3279
3280 ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
3281 &reply_pages, &reply_len);
3282 if (ret && ret != -ETIMEDOUT) {
3283 rbd_warn(rbd_dev, "failed to request lock: %d", ret);
3284 goto out;
3285 }
3286
3287 if (reply_len > 0 && reply_len <= PAGE_SIZE) {
3288 void *p = page_address(reply_pages[0]);
3289 void *const end = p + reply_len;
3290 u32 n;
3291
3292 ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
3293 while (n--) {
3294 u8 struct_v;
3295 u32 len;
3296
3297 ceph_decode_need(&p, end, 8 + 8, e_inval);
3298 p += 8 + 8; /* skip gid and cookie */
3299
3300 ceph_decode_32_safe(&p, end, len, e_inval);
3301 if (!len)
3302 continue;
3303
3304 if (lock_owner_responded) {
3305 rbd_warn(rbd_dev,
3306 "duplicate lock owners detected");
3307 ret = -EIO;
3308 goto out;
3309 }
3310
3311 lock_owner_responded = true;
3312 ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
3313 &struct_v, &len);
3314 if (ret) {
3315 rbd_warn(rbd_dev,
3316 "failed to decode ResponseMessage: %d",
3317 ret);
3318 goto e_inval;
3319 }
3320
3321 ret = ceph_decode_32(&p);
3322 }
3323 }
3324
3325 if (!lock_owner_responded) {
3326 rbd_warn(rbd_dev, "no lock owners detected");
3327 ret = -ETIMEDOUT;
3328 }
3329
3330out:
3331 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3332 return ret;
3333
3334e_inval:
3335 ret = -EINVAL;
3336 goto out;
3337}
3338
3339static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
3340{
3341 dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);
3342
3343 cancel_delayed_work(&rbd_dev->lock_dwork);
3344 if (wake_all)
3345 wake_up_all(&rbd_dev->lock_waitq);
3346 else
3347 wake_up(&rbd_dev->lock_waitq);
3348}
3349
3350static int get_lock_owner_info(struct rbd_device *rbd_dev,
3351 struct ceph_locker **lockers, u32 *num_lockers)
3352{
3353 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3354 u8 lock_type;
3355 char *lock_tag;
3356 int ret;
3357
3358 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3359
3360 ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
3361 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3362 &lock_type, &lock_tag, lockers, num_lockers);
3363 if (ret)
3364 return ret;
3365
3366 if (*num_lockers == 0) {
3367 dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
3368 goto out;
3369 }
3370
3371 if (strcmp(lock_tag, RBD_LOCK_TAG)) {
3372 rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
3373 lock_tag);
3374 ret = -EBUSY;
3375 goto out;
3376 }
3377
3378 if (lock_type == CEPH_CLS_LOCK_SHARED) {
3379 rbd_warn(rbd_dev, "shared lock type detected");
3380 ret = -EBUSY;
3381 goto out;
3382 }
3383
3384 if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
3385 strlen(RBD_LOCK_COOKIE_PREFIX))) {
3386 rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
3387 (*lockers)[0].id.cookie);
3388 ret = -EBUSY;
3389 goto out;
3390 }
3391
3392out:
3393 kfree(lock_tag);
3394 return ret;
3395}
3396
3397static int find_watcher(struct rbd_device *rbd_dev,
3398 const struct ceph_locker *locker)
3399{
3400 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3401 struct ceph_watch_item *watchers;
3402 u32 num_watchers;
3403 u64 cookie;
3404 int i;
3405 int ret;
3406
3407 ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
3408 &rbd_dev->header_oloc, &watchers,
3409 &num_watchers);
3410 if (ret)
3411 return ret;
3412
3413 sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
3414 for (i = 0; i < num_watchers; i++) {
3415 if (!memcmp(&watchers[i].addr, &locker->info.addr,
3416 sizeof(locker->info.addr)) &&
3417 watchers[i].cookie == cookie) {
3418 struct rbd_client_id cid = {
3419 .gid = le64_to_cpu(watchers[i].name.num),
3420 .handle = cookie,
3421 };
3422
3423 dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
3424 rbd_dev, cid.gid, cid.handle);
3425 rbd_set_owner_cid(rbd_dev, &cid);
3426 ret = 1;
3427 goto out;
3428 }
3429 }
3430
3431 dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
3432 ret = 0;
3433out:
3434 kfree(watchers);
3435 return ret;
3436}
3437
3438/*
3439 * lock_rwsem must be held for write
3440 */
3441static int rbd_try_lock(struct rbd_device *rbd_dev)
3442{
3443 struct ceph_client *client = rbd_dev->rbd_client->client;
3444 struct ceph_locker *lockers;
3445 u32 num_lockers;
3446 int ret;
3447
3448 for (;;) {
3449 ret = rbd_lock(rbd_dev);
3450 if (ret != -EBUSY)
3451 return ret;
3452
3453 /* determine if the current lock holder is still alive */
3454 ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
3455 if (ret)
3456 return ret;
3457
3458 if (num_lockers == 0)
3459 goto again;
3460
3461 ret = find_watcher(rbd_dev, lockers);
3462 if (ret) {
3463 if (ret > 0)
3464 ret = 0; /* have to request lock */
3465 goto out;
3466 }
3467
3468 rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
3469 ENTITY_NAME(lockers[0].id.name));
3470
3471 ret = ceph_monc_blacklist_add(&client->monc,
3472 &lockers[0].info.addr);
3473 if (ret) {
3474 rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
3475 ENTITY_NAME(lockers[0].id.name), ret);
3476 goto out;
3477 }
3478
3479 ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
3480 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3481 lockers[0].id.cookie,
3482 &lockers[0].id.name);
3483 if (ret && ret != -ENOENT)
3484 goto out;
3485
3486again:
3487 ceph_free_lockers(lockers, num_lockers);
3488 }
3489
3490out:
3491 ceph_free_lockers(lockers, num_lockers);
3492 return ret;
3493}
3494
3495/*
3496 * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED
3497 */
3498static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev,
3499 int *pret)
3500{
3501 enum rbd_lock_state lock_state;
3502
3503 down_read(&rbd_dev->lock_rwsem);
3504 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3505 rbd_dev->lock_state);
3506 if (__rbd_is_lock_owner(rbd_dev)) {
3507 lock_state = rbd_dev->lock_state;
3508 up_read(&rbd_dev->lock_rwsem);
3509 return lock_state;
3510 }
3511
3512 up_read(&rbd_dev->lock_rwsem);
3513 down_write(&rbd_dev->lock_rwsem);
3514 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3515 rbd_dev->lock_state);
3516 if (!__rbd_is_lock_owner(rbd_dev)) {
3517 *pret = rbd_try_lock(rbd_dev);
3518 if (*pret)
3519 rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret);
3520 }
3521
3522 lock_state = rbd_dev->lock_state;
3523 up_write(&rbd_dev->lock_rwsem);
3524 return lock_state;
3525}
3526
3527static void rbd_acquire_lock(struct work_struct *work)
3528{
3529 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3530 struct rbd_device, lock_dwork);
3531 enum rbd_lock_state lock_state;
3532 int ret;
3533
3534 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3535again:
3536 lock_state = rbd_try_acquire_lock(rbd_dev, &ret);
3537 if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) {
3538 if (lock_state == RBD_LOCK_STATE_LOCKED)
3539 wake_requests(rbd_dev, true);
3540 dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
3541 rbd_dev, lock_state, ret);
3542 return;
3543 }
3544
3545 ret = rbd_request_lock(rbd_dev);
3546 if (ret == -ETIMEDOUT) {
3547 goto again; /* treat this as a dead client */
3548 } else if (ret < 0) {
3549 rbd_warn(rbd_dev, "error requesting lock: %d", ret);
3550 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3551 RBD_RETRY_DELAY);
3552 } else {
3553 /*
3554 * lock owner acked, but resend if we don't see them
3555 * release the lock
3556 */
3557 dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
3558 rbd_dev);
3559 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3560 msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
3561 }
3562}
3563
3564/*
3565 * lock_rwsem must be held for write
3566 */
3567static bool rbd_release_lock(struct rbd_device *rbd_dev)
3568{
3569 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3570 rbd_dev->lock_state);
3571 if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
3572 return false;
3573
3574 rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
3575 downgrade_write(&rbd_dev->lock_rwsem);
3576 /*
3577 * Ensure that all in-flight IO is flushed.
3578 *
3579 * FIXME: ceph_osdc_sync() flushes the entire OSD client, which
3580 * may be shared with other devices.
3581 */
3582 ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc);
3583 up_read(&rbd_dev->lock_rwsem);
3584
3585 down_write(&rbd_dev->lock_rwsem);
3586 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3587 rbd_dev->lock_state);
3588 if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
3589 return false;
3590
3591 if (!rbd_unlock(rbd_dev))
3592 /*
3593 * Give others a chance to grab the lock - we would re-acquire
3594 * almost immediately if we got new IO during ceph_osdc_sync()
3595 * otherwise. We need to ack our own notifications, so this
3596 * lock_dwork will be requeued from rbd_wait_state_locked()
3597 * after wake_requests() in rbd_handle_released_lock().
3598 */
3599 cancel_delayed_work(&rbd_dev->lock_dwork);
3600
3601 return true;
3602}
3603
3604static void rbd_release_lock_work(struct work_struct *work)
3605{
3606 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3607 unlock_work);
3608
3609 down_write(&rbd_dev->lock_rwsem);
3610 rbd_release_lock(rbd_dev);
3611 up_write(&rbd_dev->lock_rwsem);
3612}
3613
3614static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
3615 void **p)
3616{
3617 struct rbd_client_id cid = { 0 };
3618
3619 if (struct_v >= 2) {
3620 cid.gid = ceph_decode_64(p);
3621 cid.handle = ceph_decode_64(p);
3622 }
3623
3624 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3625 cid.handle);
3626 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3627 down_write(&rbd_dev->lock_rwsem);
3628 if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3629 /*
3630 * we already know that the remote client is
3631 * the owner
3632 */
3633 up_write(&rbd_dev->lock_rwsem);
3634 return;
3635 }
3636
3637 rbd_set_owner_cid(rbd_dev, &cid);
3638 downgrade_write(&rbd_dev->lock_rwsem);
3639 } else {
3640 down_read(&rbd_dev->lock_rwsem);
3641 }
3642
3643 if (!__rbd_is_lock_owner(rbd_dev))
3644 wake_requests(rbd_dev, false);
3645 up_read(&rbd_dev->lock_rwsem);
3646}
3647
3648static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
3649 void **p)
3650{
3651 struct rbd_client_id cid = { 0 };
3652
3653 if (struct_v >= 2) {
3654 cid.gid = ceph_decode_64(p);
3655 cid.handle = ceph_decode_64(p);
3656 }
3657
3658 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3659 cid.handle);
3660 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3661 down_write(&rbd_dev->lock_rwsem);
3662 if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3663 dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
3664 __func__, rbd_dev, cid.gid, cid.handle,
3665 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
3666 up_write(&rbd_dev->lock_rwsem);
3667 return;
3668 }
3669
3670 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3671 downgrade_write(&rbd_dev->lock_rwsem);
3672 } else {
3673 down_read(&rbd_dev->lock_rwsem);
3674 }
3675
3676 if (!__rbd_is_lock_owner(rbd_dev))
3677 wake_requests(rbd_dev, false);
3678 up_read(&rbd_dev->lock_rwsem);
3679}
3680
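/*
 * Returns true if the caller needs to send back a ResponseMessage,
 * i.e. the request came from another client and we are currently
 * the lock owner.
 */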
3681static bool rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
3682 void **p)
3683{
3684 struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
3685 struct rbd_client_id cid = { 0 };
3686 bool need_to_send;
3687
3688 if (struct_v >= 2) {
3689 cid.gid = ceph_decode_64(p);
3690 cid.handle = ceph_decode_64(p);
3691 }
3692
3693 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3694 cid.handle);
3695 if (rbd_cid_equal(&cid, &my_cid))
3696 return false;
3697
3698 down_read(&rbd_dev->lock_rwsem);
3699 need_to_send = __rbd_is_lock_owner(rbd_dev);
3700 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
3701 if (!rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid)) {
3702 dout("%s rbd_dev %p queueing unlock_work\n", __func__,
3703 rbd_dev);
3704 queue_work(rbd_dev->task_wq, &rbd_dev->unlock_work);
3705 }
3706 }
3707 up_read(&rbd_dev->lock_rwsem);
3708 return need_to_send;
3709}
3710
3711static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
3712 u64 notify_id, u64 cookie, s32 *result)
3713{
3714 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3715 int buf_size = 4 + CEPH_ENCODING_START_BLK_LEN;
3716 char buf[buf_size];
3717 int ret;
3718
3719 if (result) {
3720 void *p = buf;
3721
3722 /* encode ResponseMessage */
3723 ceph_start_encoding(&p, 1, 1,
3724 buf_size - CEPH_ENCODING_START_BLK_LEN);
3725 ceph_encode_32(&p, *result);
3726 } else {
3727 buf_size = 0;
3728 }
3729
3730 ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
3731 &rbd_dev->header_oloc, notify_id, cookie,
3732 buf, buf_size);
3733 if (ret)
3734 rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
3735}
3736
3737static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
3738 u64 cookie)
3739{
3740 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3741 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
3742}
3743
3744static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
3745 u64 notify_id, u64 cookie, s32 result)
3746{
3747 dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3748 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
3749}
3750
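/*
 * Watch callback: decode the NotifyMessage header and dispatch on
 * notify_op.  Every notify is acked, even ones we don't understand,
 * so that the notifier doesn't block until its timeout expires.
 */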
Ilya Dryomov922dab62016-05-26 01:15:02 +02003751static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
3752 u64 notifier_id, void *data, size_t data_len)
Alex Elderb8d70032012-11-30 17:53:04 -06003753{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003754 struct rbd_device *rbd_dev = arg;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003755 void *p = data;
3756 void *const end = p + data_len;
3757 u8 struct_v;
3758 u32 len;
3759 u32 notify_op;
Alex Elderb8d70032012-11-30 17:53:04 -06003760 int ret;
3761
Ilya Dryomoved95b212016-08-12 16:40:02 +02003762 dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
3763 __func__, rbd_dev, cookie, notify_id, data_len);
3764 if (data_len) {
3765 ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
3766 &struct_v, &len);
3767 if (ret) {
3768 rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
3769 ret);
3770 return;
3771 }
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04003772
Ilya Dryomoved95b212016-08-12 16:40:02 +02003773 notify_op = ceph_decode_32(&p);
3774 } else {
3775 /* legacy notification for header updates */
3776 notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
3777 len = 0;
3778 }
Alex Elderb8d70032012-11-30 17:53:04 -06003779
Ilya Dryomoved95b212016-08-12 16:40:02 +02003780 dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
3781 switch (notify_op) {
3782 case RBD_NOTIFY_OP_ACQUIRED_LOCK:
3783 rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
3784 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3785 break;
3786 case RBD_NOTIFY_OP_RELEASED_LOCK:
3787 rbd_handle_released_lock(rbd_dev, struct_v, &p);
3788 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3789 break;
3790 case RBD_NOTIFY_OP_REQUEST_LOCK:
3791 if (rbd_handle_request_lock(rbd_dev, struct_v, &p))
3792 /*
3793 * send ResponseMessage(0) back so the client
3794 * can detect a missing owner
3795 */
3796 rbd_acknowledge_notify_result(rbd_dev, notify_id,
3797 cookie, 0);
3798 else
3799 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3800 break;
3801 case RBD_NOTIFY_OP_HEADER_UPDATE:
3802 ret = rbd_dev_refresh(rbd_dev);
3803 if (ret)
3804 rbd_warn(rbd_dev, "refresh failed: %d", ret);
3805
3806 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3807 break;
3808 default:
3809 if (rbd_is_lock_owner(rbd_dev))
3810 rbd_acknowledge_notify_result(rbd_dev, notify_id,
3811 cookie, -EOPNOTSUPP);
3812 else
3813 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3814 break;
3815 }
Alex Elderb8d70032012-11-30 17:53:04 -06003816}
3817
Ilya Dryomov99d16942016-08-12 16:11:41 +02003818static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
3819
Ilya Dryomov922dab62016-05-26 01:15:02 +02003820static void rbd_watch_errcb(void *arg, u64 cookie, int err)
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003821{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003822 struct rbd_device *rbd_dev = arg;
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003823
Ilya Dryomov922dab62016-05-26 01:15:02 +02003824 rbd_warn(rbd_dev, "encountered watch error: %d", err);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003825
Ilya Dryomoved95b212016-08-12 16:40:02 +02003826 down_write(&rbd_dev->lock_rwsem);
3827 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3828 up_write(&rbd_dev->lock_rwsem);
3829
Ilya Dryomov99d16942016-08-12 16:11:41 +02003830 mutex_lock(&rbd_dev->watch_mutex);
3831 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
3832 __rbd_unregister_watch(rbd_dev);
3833 rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003834
Ilya Dryomov99d16942016-08-12 16:11:41 +02003835 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003836 }
Ilya Dryomov99d16942016-08-12 16:11:41 +02003837 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003838}
3839
3840/*
Ilya Dryomov99d16942016-08-12 16:11:41 +02003841 * watch_mutex must be locked
Alex Elder9969ebc2013-01-18 12:31:10 -06003842 */
Ilya Dryomov99d16942016-08-12 16:11:41 +02003843static int __rbd_register_watch(struct rbd_device *rbd_dev)
Alex Elder9969ebc2013-01-18 12:31:10 -06003844{
3845 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomov922dab62016-05-26 01:15:02 +02003846 struct ceph_osd_linger_request *handle;
Alex Elder9969ebc2013-01-18 12:31:10 -06003847
Ilya Dryomov922dab62016-05-26 01:15:02 +02003848 rbd_assert(!rbd_dev->watch_handle);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003849 dout("%s rbd_dev %p\n", __func__, rbd_dev);
Alex Elder9969ebc2013-01-18 12:31:10 -06003850
Ilya Dryomov922dab62016-05-26 01:15:02 +02003851 handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
3852 &rbd_dev->header_oloc, rbd_watch_cb,
3853 rbd_watch_errcb, rbd_dev);
3854 if (IS_ERR(handle))
3855 return PTR_ERR(handle);
Alex Elder9969ebc2013-01-18 12:31:10 -06003856
Ilya Dryomov922dab62016-05-26 01:15:02 +02003857 rbd_dev->watch_handle = handle;
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003858 return 0;
Alex Elder9969ebc2013-01-18 12:31:10 -06003859}
3860
Ilya Dryomov99d16942016-08-12 16:11:41 +02003861/*
3862 * watch_mutex must be locked
3863 */
3864static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
Ilya Dryomovfca27062013-12-16 18:02:40 +02003865{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003866 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3867 int ret;
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003868
Ilya Dryomov99d16942016-08-12 16:11:41 +02003869 rbd_assert(rbd_dev->watch_handle);
3870 dout("%s rbd_dev %p\n", __func__, rbd_dev);
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003871
Ilya Dryomov922dab62016-05-26 01:15:02 +02003872 ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
3873 if (ret)
3874 rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003875
Ilya Dryomov922dab62016-05-26 01:15:02 +02003876 rbd_dev->watch_handle = NULL;
Ilya Dryomovc525f032016-04-28 16:07:26 +02003877}
3878
Ilya Dryomov99d16942016-08-12 16:11:41 +02003879static int rbd_register_watch(struct rbd_device *rbd_dev)
Ilya Dryomovc525f032016-04-28 16:07:26 +02003880{
Ilya Dryomov99d16942016-08-12 16:11:41 +02003881 int ret;
Ilya Dryomov811c6682016-04-15 16:22:16 +02003882
Ilya Dryomov99d16942016-08-12 16:11:41 +02003883 mutex_lock(&rbd_dev->watch_mutex);
3884 rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
3885 ret = __rbd_register_watch(rbd_dev);
3886 if (ret)
3887 goto out;
3888
3889 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
3890 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
3891
3892out:
3893 mutex_unlock(&rbd_dev->watch_mutex);
3894 return ret;
3895}
3896
3897static void cancel_tasks_sync(struct rbd_device *rbd_dev)
3898{
3899 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3900
3901 cancel_delayed_work_sync(&rbd_dev->watch_dwork);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003902 cancel_work_sync(&rbd_dev->acquired_lock_work);
3903 cancel_work_sync(&rbd_dev->released_lock_work);
3904 cancel_delayed_work_sync(&rbd_dev->lock_dwork);
3905 cancel_work_sync(&rbd_dev->unlock_work);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003906}
3907
3908static void rbd_unregister_watch(struct rbd_device *rbd_dev)
3909{
Ilya Dryomoved95b212016-08-12 16:40:02 +02003910 WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
Ilya Dryomov99d16942016-08-12 16:11:41 +02003911 cancel_tasks_sync(rbd_dev);
3912
3913 mutex_lock(&rbd_dev->watch_mutex);
3914 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
3915 __rbd_unregister_watch(rbd_dev);
3916 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
3917 mutex_unlock(&rbd_dev->watch_mutex);
3918
Ilya Dryomov811c6682016-04-15 16:22:16 +02003919 ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
Ilya Dryomovfca27062013-12-16 18:02:40 +02003920}
3921
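/*
 * Rearm the watch after an error: drop the lock if we held it,
 * re-register the watch, refresh the header, and then try to take
 * the lock back so that blocked requests can proceed.
 */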
Ilya Dryomov99d16942016-08-12 16:11:41 +02003922static void rbd_reregister_watch(struct work_struct *work)
3923{
3924 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3925 struct rbd_device, watch_dwork);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003926 bool was_lock_owner = false;
Ilya Dryomov99d16942016-08-12 16:11:41 +02003927 int ret;
3928
3929 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3930
Ilya Dryomoved95b212016-08-12 16:40:02 +02003931 down_write(&rbd_dev->lock_rwsem);
3932 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
3933 was_lock_owner = rbd_release_lock(rbd_dev);
3934
Ilya Dryomov99d16942016-08-12 16:11:41 +02003935 mutex_lock(&rbd_dev->watch_mutex);
3936 if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR)
3937 goto fail_unlock;
3938
3939 ret = __rbd_register_watch(rbd_dev);
3940 if (ret) {
3941 rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
3942 if (ret != -EBLACKLISTED)
3943 queue_delayed_work(rbd_dev->task_wq,
3944 &rbd_dev->watch_dwork,
3945 RBD_RETRY_DELAY);
3946 goto fail_unlock;
3947 }
3948
3949 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
3950 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
3951 mutex_unlock(&rbd_dev->watch_mutex);
3952
3953 ret = rbd_dev_refresh(rbd_dev);
3954 if (ret)
3955		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
3956
Ilya Dryomoved95b212016-08-12 16:40:02 +02003957 if (was_lock_owner) {
3958 ret = rbd_try_lock(rbd_dev);
3959 if (ret)
3960			rbd_warn(rbd_dev, "reregistration lock failed: %d",
3961 ret);
3962 }
3963
3964 up_write(&rbd_dev->lock_rwsem);
3965 wake_requests(rbd_dev, true);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003966 return;
3967
3968fail_unlock:
3969 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003970 up_write(&rbd_dev->lock_rwsem);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003971}
3972
/*
 * Synchronous osd object method call.  Returns the number of bytes
 * returned in the inbound buffer, or a negative error code.
 */
static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const void *outbound,
			     size_t outbound_size,
			     void *inbound,
			     size_t inbound_size)
{
	struct rbd_obj_request *obj_request;
	struct page **pages;
	u32 page_count;
	int ret;

	/*
	 * Method calls are ultimately read operations.  The result
	 * should be placed into the inbound buffer provided.  They
	 * also supply outbound data--parameters for the object
	 * method.  Currently if this is present it will be a
	 * snapshot id.
	 */
	page_count = (u32)calc_pages_for(0, inbound_size);
	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	ret = -ENOMEM;
	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
							OBJ_REQUEST_PAGES);
	if (!obj_request)
		goto out;

	obj_request->pages = pages;
	obj_request->page_count = page_count;

	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
						  obj_request);
	if (!obj_request->osd_req)
		goto out;

	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
					class_name, method_name);
	if (outbound_size) {
		struct ceph_pagelist *pagelist;

		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
		if (!pagelist)
			goto out;

		ceph_pagelist_init(pagelist);
		ceph_pagelist_append(pagelist, outbound, outbound_size);
		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
						pagelist);
	}
	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
					obj_request->pages, inbound_size,
					0, false, false);

	rbd_obj_request_submit(obj_request);
	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out;

	ret = obj_request->result;
	if (ret < 0)
		goto out;

	rbd_assert(obj_request->xferred < (u64)INT_MAX);
	ret = (int)obj_request->xferred;
	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
out:
	if (obj_request)
		rbd_obj_request_put(obj_request);
	else
		ceph_release_page_vector(pages, page_count);

	return ret;
}
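
/*
 * Illustrative only (not part of the driver): a typical caller packs
 * little-endian parameters and decodes the little-endian reply, as
 * _rbd_dev_v2_snap_size() does further down for the "rbd" class
 * "get_size" method:
 *
 *	__le64 snapid = cpu_to_le64(CEPH_NOSNAP);
 *	struct {
 *		u8 order;
 *		__le64 size;
 *	} __attribute__ ((packed)) size_buf = { 0 };
 *
 *	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
 *				  "rbd", "get_size",
 *				  &snapid, sizeof (snapid),
 *				  &size_buf, sizeof (size_buf));
 *
 * A non-negative return is the reply length; callers check it against
 * the expected size before decoding (see the sizeof checks below).
 */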

/*
 * lock_rwsem must be held for read
 */
static void rbd_wait_state_locked(struct rbd_device *rbd_dev)
{
	DEFINE_WAIT(wait);

	do {
		/*
		 * Note the use of mod_delayed_work() in rbd_acquire_lock()
		 * and cancel_delayed_work() in wake_requests().
		 */
		dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
		prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
					  TASK_UNINTERRUPTIBLE);
		up_read(&rbd_dev->lock_rwsem);
		schedule();
		down_read(&rbd_dev->lock_rwsem);
	} while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);
	finish_wait(&rbd_dev->lock_waitq, &wait);
}
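
/*
 * Each pass through the loop above kicks lock_dwork to (re)try
 * acquiring the exclusive lock, drops lock_rwsem so the lock workers
 * can take it for write, and sleeps on lock_waitq until wake_requests()
 * signals a lock-state transition.  The waiter exits only once
 * lock_state has become RBD_LOCK_STATE_LOCKED.
 */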

static void rbd_queue_workfn(struct work_struct *work)
{
	struct request *rq = blk_mq_rq_from_pdu(work);
	struct rbd_device *rbd_dev = rq->q->queuedata;
	struct rbd_img_request *img_request;
	struct ceph_snap_context *snapc = NULL;
	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
	u64 length = blk_rq_bytes(rq);
	enum obj_operation_type op_type;
	u64 mapping_size;
	bool must_be_locked;
	int result;

	if (rq->cmd_type != REQ_TYPE_FS) {
		dout("%s: non-fs request type %d\n", __func__,
			(int) rq->cmd_type);
		result = -EIO;
		goto err;
	}

	if (req_op(rq) == REQ_OP_DISCARD)
		op_type = OBJ_OP_DISCARD;
	else if (req_op(rq) == REQ_OP_WRITE)
		op_type = OBJ_OP_WRITE;
	else
		op_type = OBJ_OP_READ;

	/* Ignore/skip any zero-length requests */

	if (!length) {
		dout("%s: zero-length request\n", __func__);
		result = 0;
		goto err_rq;
	}

	/* Only reads are allowed to a read-only device */

	if (op_type != OBJ_OP_READ) {
		if (rbd_dev->mapping.read_only) {
			result = -EROFS;
			goto err_rq;
		}
		rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
	}

	/*
	 * Quit early if the mapped snapshot no longer exists.  It's
	 * still possible the snapshot will have disappeared by the
	 * time our request arrives at the osd, but there's no sense in
	 * sending it if we already know.
	 */
	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
		dout("request for non-existent snapshot");
		rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
		result = -ENXIO;
		goto err_rq;
	}

	if (offset && length > U64_MAX - offset + 1) {
		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
			 length);
		result = -EINVAL;
		goto err_rq;	/* Shouldn't happen */
	}

	blk_mq_start_request(rq);

	down_read(&rbd_dev->header_rwsem);
	mapping_size = rbd_dev->mapping.size;
	if (op_type != OBJ_OP_READ) {
		snapc = rbd_dev->header.snapc;
		ceph_get_snap_context(snapc);
		must_be_locked = rbd_is_lock_supported(rbd_dev);
	} else {
		must_be_locked = rbd_dev->opts->lock_on_read &&
					rbd_is_lock_supported(rbd_dev);
	}
	up_read(&rbd_dev->header_rwsem);

	if (offset + length > mapping_size) {
		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
			 length, mapping_size);
		result = -EIO;
		goto err_rq;
	}

	if (must_be_locked) {
		down_read(&rbd_dev->lock_rwsem);
		if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
			rbd_wait_state_locked(rbd_dev);
	}

	img_request = rbd_img_request_create(rbd_dev, offset, length, op_type,
					     snapc);
	if (!img_request) {
		result = -ENOMEM;
		goto err_unlock;
	}
	img_request->rq = rq;
	snapc = NULL; /* img_request consumes a ref */

	if (op_type == OBJ_OP_DISCARD)
		result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA,
					      NULL);
	else
		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
					      rq->bio);
	if (result)
		goto err_img_request;

	result = rbd_img_request_submit(img_request);
	if (result)
		goto err_img_request;

	if (must_be_locked)
		up_read(&rbd_dev->lock_rwsem);
	return;

err_img_request:
	rbd_img_request_put(img_request);
err_unlock:
	if (must_be_locked)
		up_read(&rbd_dev->lock_rwsem);
err_rq:
	if (result)
		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
			 obj_op_name(op_type), length, offset, result);
	ceph_put_snap_context(snapc);
err:
	blk_mq_end_request(rq, result);
}

static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
		const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	struct work_struct *work = blk_mq_rq_to_pdu(rq);

	queue_work(rbd_wq, work);
	return BLK_MQ_RQ_QUEUE_OK;
}
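
/*
 * A sketch of the request flow implied above: blk-mq allocates a
 * work_struct as the per-request payload (see rbd_init_request()
 * below), so queueing an rbd request is just handing that work item
 * to rbd_wq.  rbd_queue_workfn() then runs in process context, where
 * it is free to sleep, e.g. in rbd_wait_state_locked().
 */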

static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk = rbd_dev->disk;

	if (!disk)
		return;

	rbd_dev->disk = NULL;
	if (disk->flags & GENHD_FL_UP) {
		del_gendisk(disk);
		if (disk->queue)
			blk_cleanup_queue(disk->queue);
		blk_mq_free_tag_set(&rbd_dev->tag_set);
	}
	put_disk(disk);
}

static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
				const char *object_name,
				u64 offset, u64 length, void *buf)
{
	struct rbd_obj_request *obj_request;
	struct page **pages = NULL;
	u32 page_count;
	size_t size;
	int ret;

	page_count = (u32) calc_pages_for(offset, length);
	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	ret = -ENOMEM;
	obj_request = rbd_obj_request_create(object_name, offset, length,
							OBJ_REQUEST_PAGES);
	if (!obj_request)
		goto out;

	obj_request->pages = pages;
	obj_request->page_count = page_count;

	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
						  obj_request);
	if (!obj_request->osd_req)
		goto out;

	osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
					offset, length, 0, 0);
	osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
					obj_request->pages,
					obj_request->length,
					obj_request->offset & ~PAGE_MASK,
					false, false);

	rbd_obj_request_submit(obj_request);
	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out;

	ret = obj_request->result;
	if (ret < 0)
		goto out;

	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
	size = (size_t) obj_request->xferred;
	ceph_copy_from_page_vector(pages, buf, 0, size);
	rbd_assert(size <= (size_t)INT_MAX);
	ret = (int)size;
out:
	if (obj_request)
		rbd_obj_request_put(obj_request);
	else
		ceph_release_page_vector(pages, page_count);

	return ret;
}

/*
 * Read the complete header for the given rbd device.  On successful
 * return, the rbd_dev->header field will contain up-to-date
 * information about the image.
 */
static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		kfree(ondisk);

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return -ENOMEM;

		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_oid.name,
					0, size, ondisk);
		if (ret < 0)
			goto out;
		if ((size_t)ret < size) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
				size, ret);
			goto out;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "invalid header");
			goto out;
		}

		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	ret = rbd_header_from_disk(rbd_dev, ondisk);
out:
	kfree(ondisk);

	return ret;
}
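
/*
 * A worked example of the sizing above (numbers purely illustrative):
 * with 3 snapshots whose names total 24 bytes including NULs, the
 * buffer is sizeof(*ondisk) + 3 * sizeof(struct rbd_image_snap_ondisk)
 * + 24.  If a 4th snapshot is created between reads, snap_count comes
 * back as 4 != want_count and the loop re-reads with a larger buffer.
 */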

/*
 * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
 * has disappeared from the (just updated) snapshot context.
 */
static void rbd_exists_validate(struct rbd_device *rbd_dev)
{
	u64 snap_id;

	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
		return;

	snap_id = rbd_dev->spec->snap_id;
	if (snap_id == CEPH_NOSNAP)
		return;

	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
}

static void rbd_dev_update_size(struct rbd_device *rbd_dev)
{
	sector_t size;

	/*
	 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
	 * try to update its size.  If REMOVING is set, updating size
	 * is just useless work since the device can't be opened.
	 */
	if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
	    !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
		dout("setting size to %llu sectors", (unsigned long long)size);
		set_capacity(rbd_dev->disk, size);
		revalidate_disk(rbd_dev->disk);
	}
}

static int rbd_dev_refresh(struct rbd_device *rbd_dev)
{
	u64 mapping_size;
	int ret;

	down_write(&rbd_dev->header_rwsem);
	mapping_size = rbd_dev->mapping.size;

	ret = rbd_dev_header_info(rbd_dev);
	if (ret)
		goto out;

	/*
	 * If there is a parent, see if it has disappeared due to the
	 * mapped image getting flattened.
	 */
	if (rbd_dev->parent) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret)
			goto out;
	}

	if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
		rbd_dev->mapping.size = rbd_dev->header.image_size;
	} else {
		/* validate mapped snapshot's EXISTS flag */
		rbd_exists_validate(rbd_dev);
	}

out:
	up_write(&rbd_dev->header_rwsem);
	if (!ret && mapping_size != rbd_dev->mapping.size)
		rbd_dev_update_size(rbd_dev);

	return ret;
}

static int rbd_init_request(void *data, struct request *rq,
		unsigned int hctx_idx, unsigned int request_idx,
		unsigned int numa_node)
{
	struct work_struct *work = blk_mq_rq_to_pdu(rq);

	INIT_WORK(work, rbd_queue_workfn);
	return 0;
}

static struct blk_mq_ops rbd_mq_ops = {
	.queue_rq	= rbd_queue_rq,
	.map_queue	= blk_mq_map_queue,
	.init_request	= rbd_init_request,
};

static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;
	int err;

	/* create gendisk info */
	disk = alloc_disk(single_major ?
			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
			  RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = rbd_dev->minor;
	if (single_major)
		disk->flags |= GENHD_FL_EXT_DEVT;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
	rbd_dev->tag_set.ops = &rbd_mq_ops;
	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
	rbd_dev->tag_set.nr_hw_queues = 1;
	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);

	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
	if (err)
		goto out_disk;

	q = blk_mq_init_queue(&rbd_dev->tag_set);
	if (IS_ERR(q)) {
		err = PTR_ERR(q);
		goto out_tag_set;
	}

	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	q->limits.max_sectors = queue_max_hw_sectors(q);
	blk_queue_max_segments(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* enable the discard support */
	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
	q->limits.discard_granularity = segment_size;
	q->limits.discard_alignment = segment_size;
	blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE);
	q->limits.discard_zeroes_data = 1;

	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
		q->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES;

	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	return 0;
out_tag_set:
	blk_mq_free_tag_set(&rbd_dev->tag_set);
out_disk:
	put_disk(disk);
	return err;
}
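
/*
 * To make the limits above concrete, assuming the common 4 MiB object
 * size (obj_order 22): segment_size is 4194304 bytes, so the queue
 * advertises 4194304 / 512 = 8192 max hw sectors, a 4 MiB maximum
 * segment size, and a 4 MiB discard granularity; no single request is
 * larger than one rados object.
 */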

/*
  sysfs
*/

static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}

static ssize_t rbd_size_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n",
		(unsigned long long)rbd_dev->mapping.size);
}

/*
 * Note this shows the features for whatever's mapped, which is not
 * necessarily the base image.
 */
static ssize_t rbd_features_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "0x%016llx\n",
			(unsigned long long)rbd_dev->mapping.features);
}

static ssize_t rbd_major_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->major)
		return sprintf(buf, "%d\n", rbd_dev->major);

	return sprintf(buf, "(none)\n");
}

static ssize_t rbd_minor_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%d\n", rbd_dev->minor);
}

static ssize_t rbd_client_addr_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	struct ceph_entity_addr *client_addr =
	    ceph_client_addr(rbd_dev->rbd_client->client);

	return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
		       le32_to_cpu(client_addr->nonce));
}

static ssize_t rbd_client_id_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "client%lld\n",
		       ceph_client_gid(rbd_dev->rbd_client->client));
}

static ssize_t rbd_cluster_fsid_show(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
}

static ssize_t rbd_config_info_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->config_info);
}

static ssize_t rbd_pool_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
}

static ssize_t rbd_pool_id_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n",
			(unsigned long long) rbd_dev->spec->pool_id);
}

static ssize_t rbd_name_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->spec->image_name)
		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);

	return sprintf(buf, "(unknown)\n");
}

static ssize_t rbd_image_id_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
}

/*
 * Shows the name of the currently-mapped snapshot (or
 * RBD_SNAP_HEAD_NAME for the base image).
 */
static ssize_t rbd_snap_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
}

static ssize_t rbd_snap_id_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
}

/*
 * For a v2 image, shows the chain of parent images, separated by empty
 * lines.  For v1 images or if there is no parent, shows "(no parent
 * image)".
 */
static ssize_t rbd_parent_show(struct device *dev,
			       struct device_attribute *attr,
			       char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	ssize_t count = 0;

	if (!rbd_dev->parent)
		return sprintf(buf, "(no parent image)\n");

	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
		struct rbd_spec *spec = rbd_dev->parent_spec;

		count += sprintf(&buf[count], "%s"
			    "pool_id %llu\npool_name %s\n"
			    "image_id %s\nimage_name %s\n"
			    "snap_id %llu\nsnap_name %s\n"
			    "overlap %llu\n",
			    !count ? "" : "\n", /* first? */
			    spec->pool_id, spec->pool_name,
			    spec->image_id, spec->image_name ?: "(unknown)",
			    spec->snap_id, spec->snap_name,
			    rbd_dev->parent_overlap);
	}

	return count;
}

static ssize_t rbd_image_refresh(struct device *dev,
				 struct device_attribute *attr,
				 const char *buf,
				 size_t size)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int ret;

	ret = rbd_dev_refresh(rbd_dev);
	if (ret)
		return ret;

	return size;
}

static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
static DEVICE_ATTR(client_addr, S_IRUGO, rbd_client_addr_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(cluster_fsid, S_IRUGO, rbd_cluster_fsid_show, NULL);
static DEVICE_ATTR(config_info, S_IRUSR, rbd_config_info_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_minor.attr,
	&dev_attr_client_addr.attr,
	&dev_attr_client_id.attr,
	&dev_attr_cluster_fsid.attr,
	&dev_attr_config_info.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};
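
/*
 * For example (paths assume the standard sysfs layout for this bus):
 * the attributes above appear as read-only files such as
 * /sys/bus/rbd/devices/<id>/size and .../pool, while "refresh" is
 * write-only and any write to it triggers rbd_dev_refresh().
 */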

static void rbd_dev_release(struct device *dev);

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_dev_release,
};

static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
{
	kref_get(&spec->kref);

	return spec;
}

static void rbd_spec_free(struct kref *kref);
static void rbd_spec_put(struct rbd_spec *spec)
{
	if (spec)
		kref_put(&spec->kref, rbd_spec_free);
}

static struct rbd_spec *rbd_spec_alloc(void)
{
	struct rbd_spec *spec;

	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
	if (!spec)
		return NULL;

	spec->pool_id = CEPH_NOPOOL;
	spec->snap_id = CEPH_NOSNAP;
	kref_init(&spec->kref);

	return spec;
}

static void rbd_spec_free(struct kref *kref)
{
	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);

	kfree(spec->pool_name);
	kfree(spec->image_id);
	kfree(spec->image_name);
	kfree(spec->snap_name);
	kfree(spec);
}

static void rbd_dev_free(struct rbd_device *rbd_dev)
{
	WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);

	ceph_oid_destroy(&rbd_dev->header_oid);
	ceph_oloc_destroy(&rbd_dev->header_oloc);
	kfree(rbd_dev->config_info);

	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev->opts);
	kfree(rbd_dev);
}

static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	bool need_put = !!rbd_dev->opts;

	if (need_put) {
		destroy_workqueue(rbd_dev->task_wq);
		ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
	}

	rbd_dev_free(rbd_dev);

	/*
	 * This is racy, but way better than putting module outside of
	 * the release callback.  The race window is pretty small, so
	 * doing something similar to dm (dm-builtin.c) is overkill.
	 */
	if (need_put)
		module_put(THIS_MODULE);
}

static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
					   struct rbd_spec *spec)
{
	struct rbd_device *rbd_dev;

	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		return NULL;

	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	init_rwsem(&rbd_dev->header_rwsem);

	ceph_oid_init(&rbd_dev->header_oid);
	ceph_oloc_init(&rbd_dev->header_oloc);

	mutex_init(&rbd_dev->watch_mutex);
	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
	INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);

	init_rwsem(&rbd_dev->lock_rwsem);
	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
	INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
	INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
	INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
	INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
	init_waitqueue_head(&rbd_dev->lock_waitq);

	rbd_dev->dev.bus = &rbd_bus_type;
	rbd_dev->dev.type = &rbd_device_type;
	rbd_dev->dev.parent = &rbd_root_dev;
	device_initialize(&rbd_dev->dev);

	rbd_dev->rbd_client = rbdc;
	rbd_dev->spec = spec;

	rbd_dev->layout.stripe_unit = 1 << RBD_MAX_OBJ_ORDER;
	rbd_dev->layout.stripe_count = 1;
	rbd_dev->layout.object_size = 1 << RBD_MAX_OBJ_ORDER;
	rbd_dev->layout.pool_id = spec->pool_id;
	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);

	return rbd_dev;
}

/*
 * Create a mapping rbd_dev.
 */
static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
					 struct rbd_spec *spec,
					 struct rbd_options *opts)
{
	struct rbd_device *rbd_dev;

	rbd_dev = __rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		return NULL;

	rbd_dev->opts = opts;

	/* get an id and fill in device name */
	rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
					 minor_to_rbd_dev_id(1 << MINORBITS),
					 GFP_KERNEL);
	if (rbd_dev->dev_id < 0)
		goto fail_rbd_dev;

	sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
	rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
						   rbd_dev->name);
	if (!rbd_dev->task_wq)
		goto fail_dev_id;

	/* we have a ref from do_rbd_add() */
	__module_get(THIS_MODULE);

	dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
	return rbd_dev;

fail_dev_id:
	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
fail_rbd_dev:
	rbd_dev_free(rbd_dev);
	return NULL;
}
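
/*
 * For example, the first mapping gets dev_id 0, so its name is "rbd0",
 * its ordered task workqueue is "rbd0-tasks", and (given the fixed
 * dev_id <-> minor mapping) its block device node is /dev/rbd0.
 */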

static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	if (rbd_dev)
		put_device(&rbd_dev->dev);
}

/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, get this information for the base
 * image.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
				"rbd", "get_size",
				&snapid, sizeof (snapid),
				&size_buf, sizeof (size_buf));
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < sizeof (size_buf))
		return -ERANGE;

	if (order) {
		*order = size_buf.order;
		dout(" order %u", (unsigned int)*order);
	}
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx snap_size = %llu\n",
		(unsigned long long)snap_id,
		(unsigned long long)*snap_size);

	return 0;
}

static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}

static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
				"rbd", "get_object_prefix", NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + ret, NULL, GFP_NOIO);
	ret = 0;

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}
out:
	kfree(reply_buf);

	return ret;
}

static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	struct {
		__le64 features;
		__le64 incompat;
	} __attribute__ ((packed)) features_buf = { 0 };
	u64 unsup;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
				"rbd", "get_features",
				&snapid, sizeof (snapid),
				&features_buf, sizeof (features_buf));
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < sizeof (features_buf))
		return -ERANGE;

	unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
	if (unsup) {
		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
			 unsup);
		return -ENXIO;
	}

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long)snap_id,
		(unsigned long long)*snap_features,
		(unsigned long long)le64_to_cpu(features_buf.incompat));

	return 0;
}
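
/*
 * As an illustration of the check above: the reply carries two masks,
 * and only the incompat one gates the mapping.  If the image has any
 * incompatible feature bit outside RBD_FEATURES_SUPPORTED, the map
 * fails with -ENXIO and the offending bits are printed by the
 * rbd_warn() call.
 */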

static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}

static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	size_t size;
	void *reply_buf = NULL;
	__le64 snapid;
	void *p;
	void *end;
	u64 pool_id;
	char *image_id;
	u64 snap_id;
	u64 overlap;
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	size = sizeof (__le64) +				/* pool_id */
		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
		sizeof (__le64) +				/* snap_id */
		sizeof (__le64);				/* overlap */
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf) {
		ret = -ENOMEM;
		goto out_err;
	}

	snapid = cpu_to_le64(rbd_dev->spec->snap_id);
	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
				"rbd", "get_parent",
				&snapid, sizeof (snapid),
				reply_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out_err;

	p = reply_buf;
	end = reply_buf + ret;
	ret = -ERANGE;
	ceph_decode_64_safe(&p, end, pool_id, out_err);
	if (pool_id == CEPH_NOPOOL) {
		/*
		 * Either the parent never existed, or we have
		 * record of it but the image got flattened so it no
		 * longer has a parent.  When the parent of a
		 * layered image disappears we immediately set the
		 * overlap to 0.  The effect of this is that all new
		 * requests will be treated as if the image had no
		 * parent.
		 */
		if (rbd_dev->parent_overlap) {
			rbd_dev->parent_overlap = 0;
			rbd_dev_parent_put(rbd_dev);
			pr_info("%s: clone image has been flattened\n",
				rbd_dev->disk->disk_name);
		}

		goto out;	/* No parent?  No problem. */
	}

	/* The ceph file layout needs to fit pool id in 32 bits */

	ret = -EIO;
	if (pool_id > (u64)U32_MAX) {
		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
			(unsigned long long)pool_id, U32_MAX);
		goto out_err;
	}

	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(image_id)) {
		ret = PTR_ERR(image_id);
		goto out_err;
	}
	ceph_decode_64_safe(&p, end, snap_id, out_err);
	ceph_decode_64_safe(&p, end, overlap, out_err);

	/*
	 * The parent won't change (except when the clone is
	 * flattened, which we handled above).  So we only need to
	 * record the parent spec if we have not already done so.
	 */
	if (!rbd_dev->parent_spec) {
		parent_spec->pool_id = pool_id;
		parent_spec->image_id = image_id;
		parent_spec->snap_id = snap_id;
		rbd_dev->parent_spec = parent_spec;
		parent_spec = NULL;	/* rbd_dev now owns this */
	} else {
		kfree(image_id);
	}

	/*
	 * We always update the parent overlap.  If it's zero we issue
	 * a warning, as we will proceed as if there was no parent.
	 */
	if (!overlap) {
		if (parent_spec) {
			/* refresh, careful to warn just once */
			if (rbd_dev->parent_overlap)
				rbd_warn(rbd_dev,
				    "clone now standalone (overlap became 0)");
		} else {
			/* initial probe */
			rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
		}
	}
	rbd_dev->parent_overlap = overlap;

out:
	ret = 0;
out_err:
	kfree(reply_buf);
	rbd_spec_put(parent_spec);

	return ret;
}

static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
{
	struct {
		__le64 stripe_unit;
		__le64 stripe_count;
	} __attribute__ ((packed)) striping_info_buf = { 0 };
	size_t size = sizeof (striping_info_buf);
	void *p;
	u64 obj_size;
	u64 stripe_unit;
	u64 stripe_count;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
				"rbd", "get_stripe_unit_count", NULL, 0,
				(char *)&striping_info_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < size)
		return -ERANGE;

	/*
	 * We don't actually support the "fancy striping" feature
	 * (STRIPINGV2) yet, but if the striping sizes are the
	 * defaults the behavior is the same as before.  So find
	 * out, and only fail if the image has non-default values.
	 */
	ret = -EINVAL;
	obj_size = (u64)1 << rbd_dev->header.obj_order;
	p = &striping_info_buf;
	stripe_unit = ceph_decode_64(&p);
	if (stripe_unit != obj_size) {
		rbd_warn(rbd_dev, "unsupported stripe unit "
				"(got %llu want %llu)",
				stripe_unit, obj_size);
		return -EINVAL;
	}
	stripe_count = ceph_decode_64(&p);
	if (stripe_count != 1) {
		rbd_warn(rbd_dev, "unsupported stripe count "
				"(got %llu want 1)", stripe_count);
		return -EINVAL;
	}
	rbd_dev->header.stripe_unit = stripe_unit;
	rbd_dev->header.stripe_count = stripe_count;

	return 0;
}
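
/*
 * In other words (an illustrative case, not extra checking): for an
 * image with obj_order 22 the only accepted reply is stripe_unit ==
 * 4194304 and stripe_count == 1, i.e. the default layout in which
 * each stripe unit is exactly one object.
 */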

static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	rbd_assert(!rbd_dev->spec->image_name);

	len = strlen(rbd_dev->spec->image_id);
	image_id_size = sizeof (__le32) + len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
				"rbd", "dir_get_name",
				image_id, image_id_size,
				reply_buf, size);
	if (ret < 0)
		goto out;
	p = reply_buf;
	end = reply_buf + ret;

	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}
5280
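/*
 * Format 1 snapshot names are stored as consecutive NUL-terminated
 * strings in header.snap_names, in the same order as the ids in
 * header.snapc->snaps[]; walk both in lockstep to map a name to its
 * snapshot id.
 */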
Alex Elder2ad3d712013-04-30 00:44:33 -05005281static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5282{
5283 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5284 const char *snap_name;
5285 u32 which = 0;
5286
5287 /* Skip over names until we find the one we are looking for */
5288
5289 snap_name = rbd_dev->header.snap_names;
5290 while (which < snapc->num_snaps) {
5291 if (!strcmp(name, snap_name))
5292 return snapc->snaps[which];
5293 snap_name += strlen(snap_name) + 1;
5294 which++;
5295 }
5296 return CEPH_NOSNAP;
5297}
5298
5299static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5300{
5301 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5302 u32 which;
5303 bool found = false;
5304 u64 snap_id;
5305
5306 for (which = 0; !found && which < snapc->num_snaps; which++) {
5307 const char *snap_name;
5308
5309 snap_id = snapc->snaps[which];
5310 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
Josh Durginefadc982013-08-29 19:16:42 -07005311 if (IS_ERR(snap_name)) {
5312 /* ignore no-longer existing snapshots */
5313 if (PTR_ERR(snap_name) == -ENOENT)
5314 continue;
5315 else
5316 break;
5317 }
Alex Elder2ad3d712013-04-30 00:44:33 -05005318 found = !strcmp(name, snap_name);
5319 kfree(snap_name);
5320 }
5321 return found ? snap_id : CEPH_NOSNAP;
5322}
5323
5324/*
5325 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
5326 * no snapshot by that name is found, or if an error occurs.
5327 */
5328static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5329{
5330 if (rbd_dev->image_format == 1)
5331 return rbd_v1_snap_id_by_name(rbd_dev, name);
5332
5333 return rbd_v2_snap_id_by_name(rbd_dev, name);
5334}
5335
Alex Elder9e15b772012-10-30 19:40:33 -05005336/*
Ilya Dryomov04077592014-07-23 17:11:20 +04005337 * An image being mapped will have everything but the snap id.
Alex Elder9e15b772012-10-30 19:40:33 -05005338 */
Ilya Dryomov04077592014-07-23 17:11:20 +04005339static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
5340{
5341 struct rbd_spec *spec = rbd_dev->spec;
5342
5343 rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
5344 rbd_assert(spec->image_id && spec->image_name);
5345 rbd_assert(spec->snap_name);
5346
5347 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
5348 u64 snap_id;
5349
5350 snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
5351 if (snap_id == CEPH_NOSNAP)
5352 return -ENOENT;
5353
5354 spec->snap_id = snap_id;
5355 } else {
5356 spec->snap_id = CEPH_NOSNAP;
5357 }
5358
5359 return 0;
5360}
5361
5362/*
5363 * A parent image will have all ids but none of the names.
5364 *
5365 * All names in an rbd spec are dynamically allocated. It's OK if we
5366 * can't figure out the name for an image id.
5367 */
5368static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
Alex Elder9e15b772012-10-30 19:40:33 -05005369{
Alex Elder2e9f7f12013-04-26 09:43:48 -05005370 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5371 struct rbd_spec *spec = rbd_dev->spec;
5372 const char *pool_name;
5373 const char *image_name;
5374 const char *snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05005375 int ret;
5376
Ilya Dryomov04077592014-07-23 17:11:20 +04005377 rbd_assert(spec->pool_id != CEPH_NOPOOL);
5378 rbd_assert(spec->image_id);
5379 rbd_assert(spec->snap_id != CEPH_NOSNAP);
Alex Elder9e15b772012-10-30 19:40:33 -05005380
Alex Elder2e9f7f12013-04-26 09:43:48 -05005381 /* Get the pool name; we have to make our own copy of this */
Alex Elder9e15b772012-10-30 19:40:33 -05005382
Alex Elder2e9f7f12013-04-26 09:43:48 -05005383 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
5384 if (!pool_name) {
5385 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05005386 return -EIO;
5387 }
Alex Elder2e9f7f12013-04-26 09:43:48 -05005388 pool_name = kstrdup(pool_name, GFP_KERNEL);
5389 if (!pool_name)
Alex Elder9e15b772012-10-30 19:40:33 -05005390 return -ENOMEM;
5391
5392 /* Fetch the image name; tolerate failure here */
5393
Alex Elder2e9f7f12013-04-26 09:43:48 -05005394 image_name = rbd_dev_image_name(rbd_dev);
5395 if (!image_name)
Alex Elder06ecc6c2012-11-01 10:17:15 -05005396 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05005397
Ilya Dryomov04077592014-07-23 17:11:20 +04005398 /* Fetch the snapshot name */
Alex Elder9e15b772012-10-30 19:40:33 -05005399
Alex Elder2e9f7f12013-04-26 09:43:48 -05005400 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
Josh Durginda6a6b62013-09-04 17:57:31 -07005401 if (IS_ERR(snap_name)) {
5402 ret = PTR_ERR(snap_name);
Alex Elder9e15b772012-10-30 19:40:33 -05005403 goto out_err;
Alex Elder2e9f7f12013-04-26 09:43:48 -05005404 }
5405
5406 spec->pool_name = pool_name;
5407 spec->image_name = image_name;
5408 spec->snap_name = snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05005409
5410 return 0;
Ilya Dryomov04077592014-07-23 17:11:20 +04005411
Alex Elder9e15b772012-10-30 19:40:33 -05005412out_err:
Alex Elder2e9f7f12013-04-26 09:43:48 -05005413 kfree(image_name);
5414 kfree(pool_name);
Alex Elder9e15b772012-10-30 19:40:33 -05005415 return ret;
5416}
5417
Alex Eldercc4a38bd2013-04-30 00:44:33 -05005418static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
Alex Elder35d489f2012-07-03 16:01:19 -05005419{
5420 size_t size;
5421 int ret;
5422 void *reply_buf;
5423 void *p;
5424 void *end;
5425 u64 seq;
5426 u32 snap_count;
5427 struct ceph_snap_context *snapc;
5428 u32 i;
5429
5430 /*
5431 * We'll need room for the seq value (maximum snapshot id),
5432 * snapshot count, and array of that many snapshot ids.
5433 * For now we have a fixed upper limit on the number we're
5434 * prepared to receive.
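	 * The reply is encoded as a __le64 seq, a __le32 snap_count,
	 * then snap_count __le64 snapshot ids; the decoding below
	 * relies on exactly that layout.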
5435 */
5436 size = sizeof (__le64) + sizeof (__le32) +
5437 RBD_MAX_SNAP_COUNT * sizeof (__le64);
5438 reply_buf = kzalloc(size, GFP_KERNEL);
5439 if (!reply_buf)
5440 return -ENOMEM;
5441
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005442 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
Alex Elder41579762013-04-21 12:14:45 -05005443 "rbd", "get_snapcontext", NULL, 0,
Alex Eldere2a58ee2013-04-30 00:44:33 -05005444 reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06005445 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder35d489f2012-07-03 16:01:19 -05005446 if (ret < 0)
5447 goto out;
5448
Alex Elder35d489f2012-07-03 16:01:19 -05005449 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05005450 end = reply_buf + ret;
5451 ret = -ERANGE;
Alex Elder35d489f2012-07-03 16:01:19 -05005452 ceph_decode_64_safe(&p, end, seq, out);
5453 ceph_decode_32_safe(&p, end, snap_count, out);
5454
5455 /*
5456 * Make sure the reported number of snapshot ids wouldn't go
5457 * beyond the end of our buffer. But before checking that,
5458 * make sure the computed size of the snapshot context we
5459 * allocate is representable in a size_t.
5460 */
5461 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
5462 / sizeof (u64)) {
5463 ret = -EINVAL;
5464 goto out;
5465 }
5466 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
5467 goto out;
Alex Elder468521c2013-04-26 09:43:47 -05005468 ret = 0;
Alex Elder35d489f2012-07-03 16:01:19 -05005469
Alex Elder812164f82013-04-30 00:44:32 -05005470 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
Alex Elder35d489f2012-07-03 16:01:19 -05005471 if (!snapc) {
5472 ret = -ENOMEM;
5473 goto out;
5474 }
Alex Elder35d489f2012-07-03 16:01:19 -05005475 snapc->seq = seq;
Alex Elder35d489f2012-07-03 16:01:19 -05005476 for (i = 0; i < snap_count; i++)
5477 snapc->snaps[i] = ceph_decode_64(&p);
5478
Alex Elder49ece552013-05-06 08:37:00 -05005479 ceph_put_snap_context(rbd_dev->header.snapc);
Alex Elder35d489f2012-07-03 16:01:19 -05005480 rbd_dev->header.snapc = snapc;
5481
5482 dout(" snap context seq = %llu, snap_count = %u\n",
Alex Elder57385b52013-04-21 12:14:45 -05005483 (unsigned long long)seq, (unsigned int)snap_count);
Alex Elder35d489f2012-07-03 16:01:19 -05005484out:
5485 kfree(reply_buf);
5486
Alex Elder57385b52013-04-21 12:14:45 -05005487 return ret;
Alex Elder35d489f2012-07-03 16:01:19 -05005488}
5489
Alex Elder54cac612013-04-30 00:44:33 -05005490static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
5491 u64 snap_id)
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005492{
5493 size_t size;
5494 void *reply_buf;
Alex Elder54cac612013-04-30 00:44:33 -05005495 __le64 snapid;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005496 int ret;
5497 void *p;
5498 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005499 char *snap_name;
5500
5501 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
5502 reply_buf = kmalloc(size, GFP_KERNEL);
5503 if (!reply_buf)
5504 return ERR_PTR(-ENOMEM);
5505
Alex Elder54cac612013-04-30 00:44:33 -05005506 snapid = cpu_to_le64(snap_id);
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005507 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005508 "rbd", "get_snapshot_name",
Alex Elder54cac612013-04-30 00:44:33 -05005509 &snapid, sizeof (snapid),
Alex Eldere2a58ee2013-04-30 00:44:33 -05005510 reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06005511 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderf40eb342013-04-25 15:09:42 -05005512 if (ret < 0) {
5513 snap_name = ERR_PTR(ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005514 goto out;
Alex Elderf40eb342013-04-25 15:09:42 -05005515 }
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005516
5517 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05005518 end = reply_buf + ret;
Alex Eldere5c35532012-10-25 23:34:41 -05005519 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderf40eb342013-04-25 15:09:42 -05005520 if (IS_ERR(snap_name))
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005521 goto out;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005522
Alex Elderf40eb342013-04-25 15:09:42 -05005523 dout(" snap_id 0x%016llx snap_name = %s\n",
Alex Elder54cac612013-04-30 00:44:33 -05005524 (unsigned long long)snap_id, snap_name);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005525out:
5526 kfree(reply_buf);
5527
Alex Elderf40eb342013-04-25 15:09:42 -05005528 return snap_name;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005529}
5530
Alex Elder2df3fac2013-05-06 09:51:30 -05005531static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
Alex Elder117973f2012-08-31 17:29:55 -05005532{
Alex Elder2df3fac2013-05-06 09:51:30 -05005533 bool first_time = rbd_dev->header.object_prefix == NULL;
Alex Elder117973f2012-08-31 17:29:55 -05005534 int ret;
Alex Elder117973f2012-08-31 17:29:55 -05005535
Josh Durgin1617e402013-06-12 14:43:10 -07005536 ret = rbd_dev_v2_image_size(rbd_dev);
5537 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05005538 return ret;
Josh Durgin1617e402013-06-12 14:43:10 -07005539
Alex Elder2df3fac2013-05-06 09:51:30 -05005540 if (first_time) {
5541 ret = rbd_dev_v2_header_onetime(rbd_dev);
5542 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05005543 return ret;
Alex Elder2df3fac2013-05-06 09:51:30 -05005544 }
5545
Alex Eldercc4a38bd2013-04-30 00:44:33 -05005546 ret = rbd_dev_v2_snap_context(rbd_dev);
Ilya Dryomovd194cd12015-08-31 18:22:10 +03005547 if (ret && first_time) {
5548 kfree(rbd_dev->header.object_prefix);
5549 rbd_dev->header.object_prefix = NULL;
5550 }
Alex Elder117973f2012-08-31 17:29:55 -05005551
5552 return ret;
5553}
5554
Ilya Dryomova720ae02014-07-23 17:11:19 +04005555static int rbd_dev_header_info(struct rbd_device *rbd_dev)
5556{
5557 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5558
5559 if (rbd_dev->image_format == 1)
5560 return rbd_dev_v1_header_info(rbd_dev);
5561
5562 return rbd_dev_v2_header_info(rbd_dev);
5563}
5564
Alex Elder1ddbe942012-01-29 13:57:44 -06005565/*
Alex Eldere28fff262012-02-02 08:13:30 -06005566 * Skips over white space at *buf, and updates *buf to point to the
5567 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06005568 * the token (string of non-white space characters) found. Note
5569 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06005570 */
5571static inline size_t next_token(const char **buf)
5572{
5573 /*
5574 * These are the characters that produce nonzero for
5575 * isspace() in the "C" and "POSIX" locales.
5576 */
5577 const char *spaces = " \f\n\r\t\v";
5578
5579 *buf += strspn(*buf, spaces); /* Find start of token */
5580
5581 return strcspn(*buf, spaces); /* Return token length */
5582}
5583
5584/*
Alex Elderea3352f2012-07-09 21:04:23 -05005585 * Finds the next token in *buf, dynamically allocates a buffer big
5586 * enough to hold a copy of it, and copies the token into the new
5587 * buffer. The copy is guaranteed to be terminated with '\0'. Note
5588 * that a duplicate buffer is created even for a zero-length token.
5589 *
5590 * Returns a pointer to the newly-allocated duplicate, or a null
5591 * pointer if memory for the duplicate was not available. If
5592 * the lenp argument is a non-null pointer, the length of the token
5593 * (not including the '\0') is returned in *lenp.
5594 *
5595 * If successful, the *buf pointer will be updated to point beyond
5596 * the end of the found token.
5597 *
5598 * Note: uses GFP_KERNEL for allocation.
5599 */
5600static inline char *dup_token(const char **buf, size_t *lenp)
5601{
5602 char *dup;
5603 size_t len;
5604
5605 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05005606 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05005607 if (!dup)
5608 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05005609 *(dup + len) = '\0';
5610 *buf += len;
5611
5612 if (lenp)
5613 *lenp = len;
5614
5615 return dup;
5616}
5617
5618/*
Alex Elder859c31d2012-10-25 23:34:42 -05005619 * Parse the options provided for an "rbd add" (i.e., rbd image
5620 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
5621 * and the data written is passed here via a NUL-terminated buffer.
5622 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05005623 *
Alex Elder859c31d2012-10-25 23:34:42 -05005624 * The information extracted from these options is recorded in
5625 * the other parameters which return dynamically-allocated
5626 * structures:
5627 * ceph_opts
5628 * The address of a pointer that will refer to a ceph options
5629 * structure. Caller must release the returned pointer using
5630 * ceph_destroy_options() when it is no longer needed.
5631 * rbd_opts
5632 * Address of an rbd options pointer. Fully initialized by
5633 * this function; caller must release with kfree().
5634 * spec
5635 * Address of an rbd image specification pointer. Fully
5636 * initialized by this function based on parsed options.
5637 * Caller must release with rbd_spec_put().
5638 *
5639 * The options passed take this form:
5640 * <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
5641 * where:
5642 * <mon_addrs>
5643 * A comma-separated list of one or more monitor addresses.
5644 * A monitor address is an ip address, optionally followed
5645 * by a port number (separated by a colon).
5646 * I.e.: ip1[:port1][,ip2[:port2]...]
5647 * <options>
5648 * A comma-separated list of ceph and/or rbd options.
5649 * <pool_name>
5650 * The name of the rados pool containing the rbd image.
5651 * <image_name>
5652 * The name of the image in that pool to map.
5653 * <snap_name>
5654 * An optional snapshot name. If provided, the mapping will
5655 * present data from the image at the time that snapshot was
5656 * created. The image head is used if no snapshot name is
5657 * provided. Snapshot mappings are always read-only.
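 *
 * An illustrative add string (all values made up):
 *	1.2.3.4:6789 name=admin rbd myimage mysnap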
Alex Eldera725f65e2012-02-02 08:13:30 -06005658 */
Alex Elder859c31d2012-10-25 23:34:42 -05005659static int rbd_add_parse_args(const char *buf,
Alex Elderdc79b112012-10-25 23:34:41 -05005660 struct ceph_options **ceph_opts,
Alex Elder859c31d2012-10-25 23:34:42 -05005661 struct rbd_options **opts,
5662 struct rbd_spec **rbd_spec)
Alex Eldera725f65e2012-02-02 08:13:30 -06005663{
Alex Elderd22f76e2012-07-12 10:46:35 -05005664 size_t len;
Alex Elder859c31d2012-10-25 23:34:42 -05005665 char *options;
Alex Elder0ddebc02012-10-25 23:34:41 -05005666 const char *mon_addrs;
Alex Elderecb4dc22013-04-26 09:43:47 -05005667 char *snap_name;
Alex Elder0ddebc02012-10-25 23:34:41 -05005668 size_t mon_addrs_size;
Alex Elder859c31d2012-10-25 23:34:42 -05005669 struct rbd_spec *spec = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05005670 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05005671 struct ceph_options *copts;
Alex Elderdc79b112012-10-25 23:34:41 -05005672 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06005673
5674 /* The first four tokens are required */
5675
Alex Elder7ef32142012-02-02 08:13:30 -06005676 len = next_token(&buf);
Alex Elder4fb5d6712012-11-01 10:17:15 -05005677 if (!len) {
5678 rbd_warn(NULL, "no monitor address(es) provided");
5679 return -EINVAL;
5680 }
Alex Elder0ddebc02012-10-25 23:34:41 -05005681 mon_addrs = buf;
Alex Elderf28e5652012-10-25 23:34:41 -05005682 mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06005683 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06005684
Alex Elderdc79b112012-10-25 23:34:41 -05005685 ret = -EINVAL;
Alex Elderf28e5652012-10-25 23:34:41 -05005686 options = dup_token(&buf, NULL);
5687 if (!options)
Alex Elderdc79b112012-10-25 23:34:41 -05005688 return -ENOMEM;
Alex Elder4fb5d6712012-11-01 10:17:15 -05005689 if (!*options) {
5690 rbd_warn(NULL, "no options provided");
5691 goto out_err;
5692 }
Alex Eldera725f65e2012-02-02 08:13:30 -06005693
Alex Elder859c31d2012-10-25 23:34:42 -05005694 spec = rbd_spec_alloc();
5695 if (!spec)
Alex Elderf28e5652012-10-25 23:34:41 -05005696 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05005697
5698 spec->pool_name = dup_token(&buf, NULL);
5699 if (!spec->pool_name)
5700 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05005701 if (!*spec->pool_name) {
5702 rbd_warn(NULL, "no pool name provided");
5703 goto out_err;
5704 }
Alex Eldere28fff262012-02-02 08:13:30 -06005705
Alex Elder69e7a022012-11-01 08:39:26 -05005706 spec->image_name = dup_token(&buf, NULL);
Alex Elder859c31d2012-10-25 23:34:42 -05005707 if (!spec->image_name)
Alex Elderf28e5652012-10-25 23:34:41 -05005708 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05005709 if (!*spec->image_name) {
5710 rbd_warn(NULL, "no image name provided");
5711 goto out_err;
5712 }
Alex Eldere28fff262012-02-02 08:13:30 -06005713
Alex Elderf28e5652012-10-25 23:34:41 -05005714 /*
5715 * Snapshot name is optional; default is to use "-"
5716 * (indicating the head/no snapshot).
5717 */
Alex Elder3feeb8942012-08-31 17:29:52 -05005718 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05005719 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05005720 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
5721 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elderf28e5652012-10-25 23:34:41 -05005722 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
Alex Elderdc79b112012-10-25 23:34:41 -05005723 ret = -ENAMETOOLONG;
Alex Elderf28e5652012-10-25 23:34:41 -05005724 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05005725 }
Alex Elderecb4dc22013-04-26 09:43:47 -05005726 snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
5727 if (!snap_name)
Alex Elderf28e5652012-10-25 23:34:41 -05005728 goto out_mem;
Alex Elderecb4dc22013-04-26 09:43:47 -05005729 *(snap_name + len) = '\0';
5730 spec->snap_name = snap_name;
Alex Eldere5c35532012-10-25 23:34:41 -05005731
Alex Elder0ddebc02012-10-25 23:34:41 -05005732 /* Initialize all rbd options to the defaults */
Alex Eldere28fff262012-02-02 08:13:30 -06005733
Alex Elder4e9afeb2012-10-25 23:34:41 -05005734 rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
5735 if (!rbd_opts)
5736 goto out_mem;
5737
5738 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
Ilya Dryomovb5584182015-06-23 16:21:19 +03005739 rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
Ilya Dryomov80de1912016-09-20 14:23:17 +02005740 rbd_opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
Alex Elderd22f76e2012-07-12 10:46:35 -05005741
Alex Elder859c31d2012-10-25 23:34:42 -05005742 copts = ceph_parse_options(options, mon_addrs,
Alex Elder0ddebc02012-10-25 23:34:41 -05005743 mon_addrs + mon_addrs_size - 1,
Alex Elder4e9afeb2012-10-25 23:34:41 -05005744 parse_rbd_opts_token, rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05005745 if (IS_ERR(copts)) {
5746 ret = PTR_ERR(copts);
Alex Elderdc79b112012-10-25 23:34:41 -05005747 goto out_err;
5748 }
Alex Elder859c31d2012-10-25 23:34:42 -05005749 kfree(options);
5750
5751 *ceph_opts = copts;
Alex Elder4e9afeb2012-10-25 23:34:41 -05005752 *opts = rbd_opts;
Alex Elder859c31d2012-10-25 23:34:42 -05005753 *rbd_spec = spec;
Alex Elder0ddebc02012-10-25 23:34:41 -05005754
Alex Elderdc79b112012-10-25 23:34:41 -05005755 return 0;
Alex Elderf28e5652012-10-25 23:34:41 -05005756out_mem:
Alex Elderdc79b112012-10-25 23:34:41 -05005757 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05005758out_err:
Alex Elder859c31d2012-10-25 23:34:42 -05005759 kfree(rbd_opts);
5760 rbd_spec_put(spec);
Alex Elderf28e5652012-10-25 23:34:41 -05005761 kfree(options);
Alex Elderd22f76e2012-07-12 10:46:35 -05005762
Alex Elderdc79b112012-10-25 23:34:41 -05005763 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06005764}
5765
Alex Elder589d30e2012-07-10 20:30:11 -05005766/*
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005767 * Return pool id (>= 0) or a negative error code; if the pool is not in our osdmap, wait for the latest osdmap and retry the lookup once.
5768 */
5769static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
5770{
Ilya Dryomova319bf52015-05-15 12:02:17 +03005771 struct ceph_options *opts = rbdc->client->options;
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005772 u64 newest_epoch;
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005773 int tries = 0;
5774 int ret;
5775
5776again:
5777 ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
5778 if (ret == -ENOENT && tries++ < 1) {
Ilya Dryomovd0b19702016-04-28 16:07:27 +02005779 ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap",
5780 &newest_epoch);
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005781 if (ret < 0)
5782 return ret;
5783
5784 if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
Ilya Dryomov7cca78c2016-04-28 16:07:28 +02005785 ceph_osdc_maybe_request_map(&rbdc->client->osdc);
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005786 (void) ceph_monc_wait_osdmap(&rbdc->client->monc,
Ilya Dryomova319bf52015-05-15 12:02:17 +03005787 newest_epoch,
5788 opts->mount_timeout);
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005789 goto again;
5790 } else {
5791 /* the osdmap we have is new enough */
5792 return -ENOENT;
5793 }
5794 }
5795
5796 return ret;
5797}
5798
5799/*
Alex Elder589d30e2012-07-10 20:30:11 -05005800 * An rbd format 2 image has a unique identifier, distinct from the
5801 * name given to it by the user. Internally, that identifier is
5802 * what's used to specify the names of objects related to the image.
5803 *
5804 * A special "rbd id" object is used to map an rbd image name to its
5805 * id. If that object doesn't exist, then there is no v2 rbd image
5806 * with the supplied name.
5807 *
5808 * This function will record the given rbd_dev's image_id field if
5809 * it can be determined, and in that case will return 0. If any
5810 * errors occur a negative errno will be returned and the rbd_dev's
5811 * image_id field will be unchanged (and should be NULL).
5812 */
5813static int rbd_dev_image_id(struct rbd_device *rbd_dev)
5814{
5815 int ret;
5816 size_t size;
5817 char *object_name;
5818 void *response;
Alex Elderc0fba362013-04-25 23:15:08 -05005819 char *image_id;
Alex Elder2f82ee52012-10-30 19:40:33 -05005820
Alex Elder589d30e2012-07-10 20:30:11 -05005821 /*
Alex Elder2c0d0a12012-10-30 19:40:33 -05005822 * When probing a parent image, the image id is already
5823 * known (and the image name likely is not). There's no
Alex Elderc0fba362013-04-25 23:15:08 -05005824 * need to fetch the image id again in this case. We
5825 * do still need to set the image format though.
Alex Elder2c0d0a12012-10-30 19:40:33 -05005826 */
Alex Elderc0fba362013-04-25 23:15:08 -05005827 if (rbd_dev->spec->image_id) {
5828 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
5829
Alex Elder2c0d0a12012-10-30 19:40:33 -05005830 return 0;
Alex Elderc0fba362013-04-25 23:15:08 -05005831 }
Alex Elder2c0d0a12012-10-30 19:40:33 -05005832
5833 /*
Alex Elder589d30e2012-07-10 20:30:11 -05005834 * First, see if the format 2 image id file exists, and if
5835 * so, get the image's persistent id from it.
5836 */
Alex Elder69e7a022012-11-01 08:39:26 -05005837 size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
Alex Elder589d30e2012-07-10 20:30:11 -05005838 object_name = kmalloc(size, GFP_NOIO);
5839 if (!object_name)
5840 return -ENOMEM;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05005841 sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
Alex Elder589d30e2012-07-10 20:30:11 -05005842 dout("rbd id object name is %s\n", object_name);
5843
5844 /* Response will be an encoded string, which includes a length */
5845
5846 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
5847 response = kzalloc(size, GFP_NOIO);
5848 if (!response) {
5849 ret = -ENOMEM;
5850 goto out;
5851 }
5852
Alex Elderc0fba362013-04-25 23:15:08 -05005853 /* If it doesn't exist we'll assume it's a format 1 image */
5854
Alex Elder36be9a72013-01-19 00:30:28 -06005855 ret = rbd_obj_method_sync(rbd_dev, object_name,
Alex Elder41579762013-04-21 12:14:45 -05005856 "rbd", "get_id", NULL, 0,
Alex Eldere2a58ee2013-04-30 00:44:33 -05005857 response, RBD_IMAGE_ID_LEN_MAX);
Alex Elder36be9a72013-01-19 00:30:28 -06005858 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderc0fba362013-04-25 23:15:08 -05005859 if (ret == -ENOENT) {
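		/*
		 * No id object means this is a format 1 image; an
		 * empty image id string doubles as the "format 1"
		 * marker (see the *image_id test near the top).
		 */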
5860 image_id = kstrdup("", GFP_KERNEL);
5861 ret = image_id ? 0 : -ENOMEM;
5862 if (!ret)
5863 rbd_dev->image_format = 1;
Ilya Dryomov7dd440c2014-09-11 18:49:18 +04005864 } else if (ret >= 0) {
Alex Elderc0fba362013-04-25 23:15:08 -05005865 void *p = response;
Alex Elder589d30e2012-07-10 20:30:11 -05005866
Alex Elderc0fba362013-04-25 23:15:08 -05005867 image_id = ceph_extract_encoded_string(&p, p + ret,
Alex Elder979ed482012-11-01 08:39:26 -05005868 NULL, GFP_NOIO);
Duan Jiong461f7582014-04-11 16:38:12 +08005869 ret = PTR_ERR_OR_ZERO(image_id);
Alex Elderc0fba362013-04-25 23:15:08 -05005870 if (!ret)
5871 rbd_dev->image_format = 2;
Alex Elderc0fba362013-04-25 23:15:08 -05005872 }
5873
5874 if (!ret) {
5875 rbd_dev->spec->image_id = image_id;
5876 dout("image_id is %s\n", image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05005877 }
5878out:
5879 kfree(response);
5880 kfree(object_name);
5881
5882 return ret;
5883}
5884
Alex Elder3abef3b2013-05-13 20:35:37 -05005885/*
5886 * Undo whatever state changes are made by a v1 or v2 header info
5887 * call.
5888 */
Alex Elder6fd48b32013-04-28 23:32:34 -05005889static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
5890{
5891 struct rbd_image_header *header;
5892
Ilya Dryomove69b8d42015-01-19 12:06:14 +03005893 rbd_dev_parent_put(rbd_dev);
Alex Elder6fd48b32013-04-28 23:32:34 -05005894
5895 /* Free dynamic fields from the header, then zero it out */
5896
5897 header = &rbd_dev->header;
Alex Elder812164f82013-04-30 00:44:32 -05005898 ceph_put_snap_context(header->snapc);
Alex Elder6fd48b32013-04-28 23:32:34 -05005899 kfree(header->snap_sizes);
5900 kfree(header->snap_names);
5901 kfree(header->object_prefix);
5902 memset(header, 0, sizeof (*header));
5903}
5904
Alex Elder2df3fac2013-05-06 09:51:30 -05005905static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
Alex Eldera30b71b2012-07-10 20:30:11 -05005906{
5907 int ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05005908
Alex Elder1e130192012-07-03 16:01:19 -05005909 ret = rbd_dev_v2_object_prefix(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05005910 if (ret)
Alex Elder1e130192012-07-03 16:01:19 -05005911 goto out_err;
Alex Elderb1b54022012-07-03 16:01:19 -05005912
Alex Elder2df3fac2013-05-06 09:51:30 -05005913 /*
5914 * Get and check the features for the image. Currently the
5915 * features are assumed to never change.
5916 */
Alex Elderb1b54022012-07-03 16:01:19 -05005917 ret = rbd_dev_v2_features(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05005918 if (ret)
Alex Elderb1b54022012-07-03 16:01:19 -05005919 goto out_err;
Alex Elder35d489f2012-07-03 16:01:19 -05005920
Alex Eldercc070d52013-04-21 12:14:45 -05005921 /* If the image supports fancy striping, get its parameters */
5922
5923 if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
5924 ret = rbd_dev_v2_striping_info(rbd_dev);
5925 if (ret < 0)
5926 goto out_err;
5927 }
Alex Elder2df3fac2013-05-06 09:51:30 -05005928	/* No support for format 2 images' crypto and compression types */
Alex Eldera30b71b2012-07-10 20:30:11 -05005929
Alex Elder35152972012-08-31 17:29:55 -05005930 return 0;
Alex Elder9d475de2012-07-03 16:01:19 -05005931out_err:
Alex Elder642a2532013-05-06 17:40:33 -05005932 rbd_dev->header.features = 0;
Alex Elder1e130192012-07-03 16:01:19 -05005933 kfree(rbd_dev->header.object_prefix);
5934 rbd_dev->header.object_prefix = NULL;
Alex Elder9d475de2012-07-03 16:01:19 -05005935
5936 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05005937}
5938
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005939/*
5940 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
5941 * rbd_dev_image_probe() recursion depth, which means it's also the
5942 * length of the already discovered part of the parent chain.
5943 */
5944static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
Alex Elder83a06262012-10-30 15:47:17 -05005945{
Alex Elder2f82ee52012-10-30 19:40:33 -05005946 struct rbd_device *parent = NULL;
Alex Elder124afba2013-04-26 15:44:36 -05005947 int ret;
5948
5949 if (!rbd_dev->parent_spec)
5950 return 0;
Alex Elder124afba2013-04-26 15:44:36 -05005951
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005952 if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
5953 pr_info("parent chain is too long (%d)\n", depth);
5954 ret = -EINVAL;
5955 goto out_err;
5956 }
5957
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005958 parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005959 if (!parent) {
5960 ret = -ENOMEM;
Alex Elder124afba2013-04-26 15:44:36 -05005961 goto out_err;
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005962 }
5963
5964 /*
5965 * Images related by parent/child relationships always share
5966 * rbd_client and spec/parent_spec, so bump their refcounts.
5967 */
5968 __rbd_get_client(rbd_dev->rbd_client);
5969 rbd_spec_get(rbd_dev->parent_spec);
Alex Elder124afba2013-04-26 15:44:36 -05005970
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005971 ret = rbd_dev_image_probe(parent, depth);
Alex Elder124afba2013-04-26 15:44:36 -05005972 if (ret < 0)
5973 goto out_err;
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005974
Alex Elder124afba2013-04-26 15:44:36 -05005975 rbd_dev->parent = parent;
Alex Eldera2acd002013-05-08 22:50:04 -05005976 atomic_set(&rbd_dev->parent_ref, 1);
Alex Elder124afba2013-04-26 15:44:36 -05005977 return 0;
Alex Elder124afba2013-04-26 15:44:36 -05005978
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005979out_err:
5980 rbd_dev_unparent(rbd_dev);
Markus Elfring1761b222015-11-23 20:16:45 +01005981 rbd_dev_destroy(parent);
Alex Elder124afba2013-04-26 15:44:36 -05005982 return ret;
5983}
5984
Ilya Dryomov811c6682016-04-15 16:22:16 +02005985/*
5986 * rbd_dev->header_rwsem must be locked for write and will be unlocked
5987 * upon return.
5988 */
Alex Elder200a6a82013-04-28 23:32:34 -05005989static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
Alex Elder124afba2013-04-26 15:44:36 -05005990{
Alex Elder83a06262012-10-30 15:47:17 -05005991 int ret;
Alex Elder83a06262012-10-30 15:47:17 -05005992
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005993 /* Record our major and minor device numbers. */
Alex Elder83a06262012-10-30 15:47:17 -05005994
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005995 if (!single_major) {
5996 ret = register_blkdev(0, rbd_dev->name);
5997 if (ret < 0)
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005998 goto err_out_unlock;
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005999
6000 rbd_dev->major = ret;
6001 rbd_dev->minor = 0;
6002 } else {
6003 rbd_dev->major = rbd_major;
6004 rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
6005 }
Alex Elder83a06262012-10-30 15:47:17 -05006006
6007 /* Set up the blkdev mapping. */
6008
6009 ret = rbd_init_disk(rbd_dev);
6010 if (ret)
6011 goto err_out_blkdev;
6012
Alex Elderf35a4de2013-05-06 09:51:29 -05006013 ret = rbd_dev_mapping_set(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05006014 if (ret)
6015 goto err_out_disk;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04006016
Alex Elderf35a4de2013-05-06 09:51:29 -05006017 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
Josh Durgin22001f62013-09-30 20:10:04 -07006018 set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only);
Alex Elderf35a4de2013-05-06 09:51:29 -05006019
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006020 dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
6021 ret = device_add(&rbd_dev->dev);
Alex Elderf35a4de2013-05-06 09:51:29 -05006022 if (ret)
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006023 goto err_out_mapping;
Alex Elder83a06262012-10-30 15:47:17 -05006024
Alex Elder83a06262012-10-30 15:47:17 -05006025 /* Everything's ready. Announce the disk to the world. */
6026
Alex Elder129b79d2013-04-26 15:44:36 -05006027 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
Ilya Dryomov811c6682016-04-15 16:22:16 +02006028 up_write(&rbd_dev->header_rwsem);
Alex Elder83a06262012-10-30 15:47:17 -05006029
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02006030 spin_lock(&rbd_dev_list_lock);
6031 list_add_tail(&rbd_dev->node, &rbd_dev_list);
6032 spin_unlock(&rbd_dev_list_lock);
6033
Ilya Dryomov811c6682016-04-15 16:22:16 +02006034 add_disk(rbd_dev->disk);
Ilya Dryomovca7909e2016-08-18 18:38:41 +02006035 pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
6036 (unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
6037 rbd_dev->header.features);
Alex Elder83a06262012-10-30 15:47:17 -05006038
6039 return ret;
Alex Elder2f82ee52012-10-30 19:40:33 -05006040
Alex Elderf35a4de2013-05-06 09:51:29 -05006041err_out_mapping:
6042 rbd_dev_mapping_clear(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05006043err_out_disk:
6044 rbd_free_disk(rbd_dev);
6045err_out_blkdev:
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006046 if (!single_major)
6047 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Ilya Dryomov811c6682016-04-15 16:22:16 +02006048err_out_unlock:
6049 up_write(&rbd_dev->header_rwsem);
Alex Elder83a06262012-10-30 15:47:17 -05006050 return ret;
6051}
6052
Alex Elder332bb122013-04-27 09:59:30 -05006053static int rbd_dev_header_name(struct rbd_device *rbd_dev)
6054{
6055 struct rbd_spec *spec = rbd_dev->spec;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006056 int ret;
Alex Elder332bb122013-04-27 09:59:30 -05006057
6058 /* Record the header object name for this rbd image. */
6059
6060 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6061
Yan, Zheng76271512016-02-03 21:24:49 +08006062 rbd_dev->header_oloc.pool = rbd_dev->layout.pool_id;
Alex Elder332bb122013-04-27 09:59:30 -05006063 if (rbd_dev->image_format == 1)
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006064 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6065 spec->image_name, RBD_SUFFIX);
Alex Elder332bb122013-04-27 09:59:30 -05006066 else
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006067 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6068 RBD_HEADER_PREFIX, spec->image_id);
Alex Elder332bb122013-04-27 09:59:30 -05006069
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006070 return ret;
Alex Elder332bb122013-04-27 09:59:30 -05006071}
6072
Alex Elder200a6a82013-04-28 23:32:34 -05006073static void rbd_dev_image_release(struct rbd_device *rbd_dev)
6074{
Alex Elder6fd48b32013-04-28 23:32:34 -05006075 rbd_dev_unprobe(rbd_dev);
Alex Elder6fd48b32013-04-28 23:32:34 -05006076 rbd_dev->image_format = 0;
6077 kfree(rbd_dev->spec->image_id);
6078 rbd_dev->spec->image_id = NULL;
6079
Alex Elder200a6a82013-04-28 23:32:34 -05006080 rbd_dev_destroy(rbd_dev);
6081}
6082
Alex Eldera30b71b2012-07-10 20:30:11 -05006083/*
6084 * Probe for the existence of the header object for the given rbd
Alex Elder1f3ef782013-05-06 17:40:33 -05006085 * device. If this image is the one being mapped (i.e., not a
6086 * parent), initiate a watch on its header object before using that
6087 * object to get detailed information about the rbd image.
Alex Eldera30b71b2012-07-10 20:30:11 -05006088 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006089static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
Alex Eldera30b71b2012-07-10 20:30:11 -05006090{
6091 int ret;
6092
6093 /*
Alex Elder3abef3b2013-05-13 20:35:37 -05006094 * Get the id from the image id object. Unless there's an
6095 * error, rbd_dev->spec->image_id will be filled in with
6096 * a dynamically-allocated string, and rbd_dev->image_format
6097 * will be set to either 1 or 2.
Alex Eldera30b71b2012-07-10 20:30:11 -05006098 */
6099 ret = rbd_dev_image_id(rbd_dev);
6100 if (ret)
Alex Elderc0fba362013-04-25 23:15:08 -05006101 return ret;
Alex Elderc0fba362013-04-25 23:15:08 -05006102
Alex Elder332bb122013-04-27 09:59:30 -05006103 ret = rbd_dev_header_name(rbd_dev);
6104 if (ret)
6105 goto err_out_format;
6106
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006107 if (!depth) {
Ilya Dryomov99d16942016-08-12 16:11:41 +02006108 ret = rbd_register_watch(rbd_dev);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006109 if (ret) {
6110 if (ret == -ENOENT)
6111 pr_info("image %s/%s does not exist\n",
6112 rbd_dev->spec->pool_name,
6113 rbd_dev->spec->image_name);
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006114 goto err_out_format;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006115 }
Alex Elder1f3ef782013-05-06 17:40:33 -05006116 }
Alex Elderb644de22013-04-27 09:59:31 -05006117
Ilya Dryomova720ae02014-07-23 17:11:19 +04006118 ret = rbd_dev_header_info(rbd_dev);
Alex Elder5655c4d2013-04-25 23:15:08 -05006119 if (ret)
Alex Elderb644de22013-04-27 09:59:31 -05006120 goto err_out_watch;
Alex Elder83a06262012-10-30 15:47:17 -05006121
Ilya Dryomov04077592014-07-23 17:11:20 +04006122 /*
6123 * If this image is the one being mapped, we have pool name and
6124 * id, image name and id, and snap name - need to fill snap id.
6125 * Otherwise this is a parent image, identified by pool, image
6126 * and snap ids - need to fill in names for those ids.
6127 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006128 if (!depth)
Ilya Dryomov04077592014-07-23 17:11:20 +04006129 ret = rbd_spec_fill_snap_id(rbd_dev);
6130 else
6131 ret = rbd_spec_fill_names(rbd_dev);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006132 if (ret) {
6133 if (ret == -ENOENT)
6134 pr_info("snap %s/%s@%s does not exist\n",
6135 rbd_dev->spec->pool_name,
6136 rbd_dev->spec->image_name,
6137 rbd_dev->spec->snap_name);
Alex Elder33dca392013-04-30 00:44:33 -05006138 goto err_out_probe;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006139 }
Alex Elder9bb81c92013-04-27 09:59:30 -05006140
Ilya Dryomove8f59b52014-07-24 10:42:13 +04006141 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
6142 ret = rbd_dev_v2_parent_info(rbd_dev);
6143 if (ret)
6144 goto err_out_probe;
6145
6146 /*
6147 * Need to warn users if this image is the one being
6148 * mapped and has a parent.
6149 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006150 if (!depth && rbd_dev->parent_spec)
Ilya Dryomove8f59b52014-07-24 10:42:13 +04006151 rbd_warn(rbd_dev,
6152 "WARNING: kernel layering is EXPERIMENTAL!");
6153 }
6154
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006155 ret = rbd_dev_probe_parent(rbd_dev, depth);
Alex Elder30d60ba2013-05-06 09:51:30 -05006156 if (ret)
6157 goto err_out_probe;
Alex Elder83a06262012-10-30 15:47:17 -05006158
Alex Elder30d60ba2013-05-06 09:51:30 -05006159 dout("discovered format %u image, header name is %s\n",
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006160 rbd_dev->image_format, rbd_dev->header_oid.name);
Alex Elder30d60ba2013-05-06 09:51:30 -05006161 return 0;
Ilya Dryomove8f59b52014-07-24 10:42:13 +04006162
Alex Elder6fd48b32013-04-28 23:32:34 -05006163err_out_probe:
6164 rbd_dev_unprobe(rbd_dev);
Alex Elderb644de22013-04-27 09:59:31 -05006165err_out_watch:
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006166 if (!depth)
Ilya Dryomov99d16942016-08-12 16:11:41 +02006167 rbd_unregister_watch(rbd_dev);
Alex Elder332bb122013-04-27 09:59:30 -05006168err_out_format:
6169 rbd_dev->image_format = 0;
Alex Elder5655c4d2013-04-25 23:15:08 -05006170 kfree(rbd_dev->spec->image_id);
6171 rbd_dev->spec->image_id = NULL;
Alex Elder5655c4d2013-04-25 23:15:08 -05006172 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05006173}
6174
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006175static ssize_t do_rbd_add(struct bus_type *bus,
6176 const char *buf,
6177 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006178{
Alex Eldercb8627c2012-07-09 21:04:23 -05006179 struct rbd_device *rbd_dev = NULL;
Alex Elderdc79b112012-10-25 23:34:41 -05006180 struct ceph_options *ceph_opts = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05006181 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05006182 struct rbd_spec *spec = NULL;
Alex Elder9d3997f2012-10-25 23:34:42 -05006183 struct rbd_client *rbdc;
Alex Elder51344a32013-05-06 07:40:30 -05006184 bool read_only;
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02006185 int rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006186
6187 if (!try_module_get(THIS_MODULE))
6188 return -ENODEV;
6189
Alex Eldera725f65e2012-02-02 08:13:30 -06006190 /* parse add command */
Alex Elder859c31d2012-10-25 23:34:42 -05006191 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
Alex Elderdc79b112012-10-25 23:34:41 -05006192 if (rc < 0)
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006193 goto out;
Alex Eldera725f65e2012-02-02 08:13:30 -06006194
Alex Elder9d3997f2012-10-25 23:34:42 -05006195 rbdc = rbd_get_client(ceph_opts);
6196 if (IS_ERR(rbdc)) {
6197 rc = PTR_ERR(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05006198 goto err_out_args;
Alex Elder9d3997f2012-10-25 23:34:42 -05006199 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006200
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006201 /* pick the pool */
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04006202 rc = rbd_add_get_pool_id(rbdc, spec->pool_name);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006203 if (rc < 0) {
6204 if (rc == -ENOENT)
6205 pr_info("pool %s does not exist\n", spec->pool_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006206 goto err_out_client;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006207 }
Alex Elderc0cd10db2013-04-26 09:43:47 -05006208 spec->pool_id = (u64)rc;
Alex Elder859c31d2012-10-25 23:34:42 -05006209
Ilya Dryomovd1475432015-06-22 13:24:48 +03006210 rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02006211 if (!rbd_dev) {
6212 rc = -ENOMEM;
Alex Elderbd4ba652012-10-25 23:34:42 -05006213 goto err_out_client;
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02006214 }
Alex Elderc53d5892012-10-25 23:34:42 -05006215 rbdc = NULL; /* rbd_dev now owns this */
6216 spec = NULL; /* rbd_dev now owns this */
Ilya Dryomovd1475432015-06-22 13:24:48 +03006217 rbd_opts = NULL; /* rbd_dev now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006218
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02006219 rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
6220 if (!rbd_dev->config_info) {
6221 rc = -ENOMEM;
6222 goto err_out_rbd_dev;
6223 }
6224
Ilya Dryomov811c6682016-04-15 16:22:16 +02006225 down_write(&rbd_dev->header_rwsem);
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006226 rc = rbd_dev_image_probe(rbd_dev, 0);
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02006227 if (rc < 0) {
6228 up_write(&rbd_dev->header_rwsem);
Alex Elderc53d5892012-10-25 23:34:42 -05006229 goto err_out_rbd_dev;
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02006230 }
Alex Elder05fd6f62012-08-29 17:11:07 -05006231
Alex Elder7ce4eef2013-05-06 17:40:33 -05006232 /* If we are mapping a snapshot it must be marked read-only */
6233
Ilya Dryomovd1475432015-06-22 13:24:48 +03006234 read_only = rbd_dev->opts->read_only;
Alex Elder7ce4eef2013-05-06 17:40:33 -05006235 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
6236 read_only = true;
6237 rbd_dev->mapping.read_only = read_only;
6238
Alex Elderb536f692013-04-28 23:32:34 -05006239 rc = rbd_dev_device_setup(rbd_dev);
Alex Elder3abef3b2013-05-13 20:35:37 -05006240 if (rc) {
Ilya Dryomove37180c2013-12-16 18:02:41 +02006241 /*
Ilya Dryomov99d16942016-08-12 16:11:41 +02006242 * rbd_unregister_watch() can't be moved into
Ilya Dryomove37180c2013-12-16 18:02:41 +02006243 * rbd_dev_image_release() without refactoring, see
6244 * commit 1f3ef78861ac.
6245 */
Ilya Dryomov99d16942016-08-12 16:11:41 +02006246 rbd_unregister_watch(rbd_dev);
Alex Elder3abef3b2013-05-13 20:35:37 -05006247 rbd_dev_image_release(rbd_dev);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006248 goto out;
Alex Elder3abef3b2013-05-13 20:35:37 -05006249 }
Alex Elderb536f692013-04-28 23:32:34 -05006250
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006251 rc = count;
6252out:
6253 module_put(THIS_MODULE);
6254 return rc;
Alex Elder3abef3b2013-05-13 20:35:37 -05006255
Alex Elderc53d5892012-10-25 23:34:42 -05006256err_out_rbd_dev:
6257 rbd_dev_destroy(rbd_dev);
Alex Elderbd4ba652012-10-25 23:34:42 -05006258err_out_client:
Alex Elder9d3997f2012-10-25 23:34:42 -05006259 rbd_put_client(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05006260err_out_args:
Alex Elder859c31d2012-10-25 23:34:42 -05006261 rbd_spec_put(spec);
Ilya Dryomovd1475432015-06-22 13:24:48 +03006262 kfree(rbd_opts);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006263 goto out;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006264}
6265
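/*
 * With single_major=Y all images share the single block major
 * registered at module load and are told apart by minor number, so
 * the plain add/remove attributes reject writes in favour of the
 * *_single_major variants.
 */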
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006266static ssize_t rbd_add(struct bus_type *bus,
6267 const char *buf,
6268 size_t count)
6269{
6270 if (single_major)
6271 return -EINVAL;
6272
6273 return do_rbd_add(bus, buf, count);
6274}
6275
6276static ssize_t rbd_add_single_major(struct bus_type *bus,
6277 const char *buf,
6278 size_t count)
6279{
6280 return do_rbd_add(bus, buf, count);
6281}
6282
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006283static void rbd_dev_device_release(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006284{
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006285 rbd_free_disk(rbd_dev);
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02006286
6287 spin_lock(&rbd_dev_list_lock);
6288 list_del_init(&rbd_dev->node);
6289 spin_unlock(&rbd_dev_list_lock);
6290
Alex Elder200a6a82013-04-28 23:32:34 -05006291 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006292 device_del(&rbd_dev->dev);
Alex Elder6d80b132013-05-06 07:40:30 -05006293 rbd_dev_mapping_clear(rbd_dev);
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006294 if (!single_major)
6295 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006296}
6297
Alex Elder05a46af2013-04-26 15:44:36 -05006298static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
6299{
Alex Elderad945fc2013-04-26 15:44:36 -05006300 while (rbd_dev->parent) {
Alex Elder05a46af2013-04-26 15:44:36 -05006301 struct rbd_device *first = rbd_dev;
6302 struct rbd_device *second = first->parent;
6303 struct rbd_device *third;
6304
6305 /*
6306 * Follow to the parent with no grandparent and
6307 * remove it.
6308 */
6309 while (second && (third = second->parent)) {
6310 first = second;
6311 second = third;
6312 }
Alex Elderad945fc2013-04-26 15:44:36 -05006313 rbd_assert(second);
Alex Elder8ad42cd2013-04-28 23:32:34 -05006314 rbd_dev_image_release(second);
Alex Elderad945fc2013-04-26 15:44:36 -05006315 first->parent = NULL;
6316 first->parent_overlap = 0;
6317
6318 rbd_assert(first->parent_spec);
Alex Elder05a46af2013-04-26 15:44:36 -05006319 rbd_spec_put(first->parent_spec);
6320 first->parent_spec = NULL;
Alex Elder05a46af2013-04-26 15:44:36 -05006321 }
6322}
6323
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006324static ssize_t do_rbd_remove(struct bus_type *bus,
6325 const char *buf,
6326 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006327{
6328 struct rbd_device *rbd_dev = NULL;
Alex Elder751cc0e2013-05-31 15:17:01 -05006329 struct list_head *tmp;
6330 int dev_id;
Mike Christie0276dca2016-08-18 18:38:45 +02006331 char opt_buf[6];
Alex Elder82a442d2013-05-31 17:40:44 -05006332 bool already = false;
Mike Christie0276dca2016-08-18 18:38:45 +02006333 bool force = false;
Alex Elder0d8189e2013-04-27 09:59:30 -05006334 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006335
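	/*
	 * Input is "<dev-id> [force]"; any option other than the
	 * literal "force" is rejected below.
	 */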
Mike Christie0276dca2016-08-18 18:38:45 +02006336 dev_id = -1;
6337 opt_buf[0] = '\0';
6338 sscanf(buf, "%d %5s", &dev_id, opt_buf);
6339 if (dev_id < 0) {
6340 pr_err("dev_id out of range\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006341 return -EINVAL;
Mike Christie0276dca2016-08-18 18:38:45 +02006342 }
6343 if (opt_buf[0] != '\0') {
6344 if (!strcmp(opt_buf, "force")) {
6345 force = true;
6346 } else {
6347 pr_err("bad remove option at '%s'\n", opt_buf);
6348 return -EINVAL;
6349 }
6350 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006351
Alex Elder751cc0e2013-05-31 15:17:01 -05006352 ret = -ENOENT;
6353 spin_lock(&rbd_dev_list_lock);
6354 list_for_each(tmp, &rbd_dev_list) {
6355 rbd_dev = list_entry(tmp, struct rbd_device, node);
6356 if (rbd_dev->dev_id == dev_id) {
6357 ret = 0;
6358 break;
6359 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006360 }
Alex Elder751cc0e2013-05-31 15:17:01 -05006361 if (!ret) {
6362 spin_lock_irq(&rbd_dev->lock);
Mike Christie0276dca2016-08-18 18:38:45 +02006363 if (rbd_dev->open_count && !force)
Alex Elder751cc0e2013-05-31 15:17:01 -05006364 ret = -EBUSY;
6365 else
Alex Elder82a442d2013-05-31 17:40:44 -05006366 already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
6367 &rbd_dev->flags);
Alex Elder751cc0e2013-05-31 15:17:01 -05006368 spin_unlock_irq(&rbd_dev->lock);
6369 }
6370 spin_unlock(&rbd_dev_list_lock);
Alex Elder82a442d2013-05-31 17:40:44 -05006371 if (ret < 0 || already)
Alex Elder1ba0f1e2013-05-31 15:17:01 -05006372 return ret;
Alex Elder751cc0e2013-05-31 15:17:01 -05006373
Mike Christie0276dca2016-08-18 18:38:45 +02006374 if (force) {
6375 /*
6376 * Prevent new IO from being queued and wait for existing
6377 * IO to complete/fail.
6378 */
6379 blk_mq_freeze_queue(rbd_dev->disk->queue);
6380 blk_set_queue_dying(rbd_dev->disk->queue);
6381 }
6382
Ilya Dryomoved95b212016-08-12 16:40:02 +02006383 down_write(&rbd_dev->lock_rwsem);
6384 if (__rbd_is_lock_owner(rbd_dev))
6385 rbd_unlock(rbd_dev);
6386 up_write(&rbd_dev->lock_rwsem);
Ilya Dryomov99d16942016-08-12 16:11:41 +02006387 rbd_unregister_watch(rbd_dev);
Ilya Dryomovfca27062013-12-16 18:02:40 +02006388
Josh Durgin98752012013-08-29 17:26:31 -07006389 /*
6390 * Don't free anything from rbd_dev->disk until after all
6391 * notifies are completely processed. Otherwise
6392 * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting
6393 * in a potential use after free of rbd_dev->disk or rbd_dev.
6394 */
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006395 rbd_dev_device_release(rbd_dev);
Alex Elder8ad42cd2013-04-28 23:32:34 -05006396 rbd_dev_image_release(rbd_dev);
Alex Elderaafb2302012-09-06 16:00:54 -05006397
Alex Elder1ba0f1e2013-05-31 15:17:01 -05006398 return count;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006399}
6400
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006401static ssize_t rbd_remove(struct bus_type *bus,
6402 const char *buf,
6403 size_t count)
6404{
6405 if (single_major)
6406 return -EINVAL;
6407
6408 return do_rbd_remove(bus, buf, count);
6409}
6410
6411static ssize_t rbd_remove_single_major(struct bus_type *bus,
6412 const char *buf,
6413 size_t count)
6414{
6415 return do_rbd_remove(bus, buf, count);
6416}
6417
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006418/*
6419 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006420 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006421 */
6422static int rbd_sysfs_init(void)
6423{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006424 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006425
Alex Elderfed4c142012-02-07 12:03:36 -06006426 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06006427 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006428 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006429
Alex Elderfed4c142012-02-07 12:03:36 -06006430 ret = bus_register(&rbd_bus_type);
6431 if (ret < 0)
6432 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006433
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006434 return ret;
6435}
6436
6437static void rbd_sysfs_cleanup(void)
6438{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006439 bus_unregister(&rbd_bus_type);
Alex Elderfed4c142012-02-07 12:03:36 -06006440 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006441}
6442
Alex Elder1c2a9df2013-05-01 12:43:03 -05006443static int rbd_slab_init(void)
6444{
6445 rbd_assert(!rbd_img_request_cache);
Geliang Tang03d94402016-03-13 15:17:32 +08006446 rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
Alex Elder868311b2013-05-01 12:43:03 -05006447 if (!rbd_img_request_cache)
6448 return -ENOMEM;
6449
6450 rbd_assert(!rbd_obj_request_cache);
Geliang Tang03d94402016-03-13 15:17:32 +08006451 rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
Alex Elder78c2a442013-05-01 12:43:04 -05006452 if (!rbd_obj_request_cache)
6453 goto out_err;
6454
6455 rbd_assert(!rbd_segment_name_cache);
6456 rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
Ilya Dryomov2d0ebc52014-01-27 17:40:18 +02006457 CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL);
Alex Elder78c2a442013-05-01 12:43:04 -05006458 if (rbd_segment_name_cache)
Alex Elder1c2a9df2013-05-01 12:43:03 -05006459 return 0;
Alex Elder78c2a442013-05-01 12:43:04 -05006460out_err:
Julia Lawall13bf2832015-09-13 14:15:26 +02006461 kmem_cache_destroy(rbd_obj_request_cache);
6462 rbd_obj_request_cache = NULL;
Alex Elder1c2a9df2013-05-01 12:43:03 -05006463
Alex Elder868311b2013-05-01 12:43:03 -05006464 kmem_cache_destroy(rbd_img_request_cache);
6465 rbd_img_request_cache = NULL;
6466
Alex Elder1c2a9df2013-05-01 12:43:03 -05006467 return -ENOMEM;
6468}
6469
6470static void rbd_slab_exit(void)
6471{
Alex Elder78c2a442013-05-01 12:43:04 -05006472 rbd_assert(rbd_segment_name_cache);
6473 kmem_cache_destroy(rbd_segment_name_cache);
6474 rbd_segment_name_cache = NULL;
6475
Alex Elder868311b2013-05-01 12:43:03 -05006476 rbd_assert(rbd_obj_request_cache);
6477 kmem_cache_destroy(rbd_obj_request_cache);
6478 rbd_obj_request_cache = NULL;
6479
Alex Elder1c2a9df2013-05-01 12:43:03 -05006480 rbd_assert(rbd_img_request_cache);
6481 kmem_cache_destroy(rbd_img_request_cache);
6482 rbd_img_request_cache = NULL;
6483}
6484
Alex Eldercc344fa2013-02-19 12:25:56 -06006485static int __init rbd_init(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006486{
6487 int rc;
6488
Alex Elder1e32d342013-01-30 11:13:33 -06006489 if (!libceph_compatible(NULL)) {
6490 rbd_warn(NULL, "libceph incompatibility (quitting)");
Alex Elder1e32d342013-01-30 11:13:33 -06006491 return -EINVAL;
6492 }
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006493
Alex Elder1c2a9df2013-05-01 12:43:03 -05006494 rc = rbd_slab_init();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006495 if (rc)
6496 return rc;
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006497
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006498 /*
6499 * The number of active work items is limited by the number of
Ilya Dryomovf77303b2015-04-22 18:28:13 +03006500 * rbd devices * queue depth, so leave @max_active at default.
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006501 */
6502 rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
6503 if (!rbd_wq) {
6504 rc = -ENOMEM;
6505 goto err_out_slab;
6506 }
6507
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006508 if (single_major) {
6509 rbd_major = register_blkdev(0, RBD_DRV_NAME);
6510 if (rbd_major < 0) {
6511 rc = rbd_major;
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006512 goto err_out_wq;
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006513 }
6514 }
6515
Alex Elder1c2a9df2013-05-01 12:43:03 -05006516 rc = rbd_sysfs_init();
6517 if (rc)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006518 goto err_out_blkdev;
Alex Elder1c2a9df2013-05-01 12:43:03 -05006519
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006520 if (single_major)
6521 pr_info("loaded (major %d)\n", rbd_major);
6522 else
6523 pr_info("loaded\n");
6524
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006525 return 0;
6526
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006527err_out_blkdev:
6528 if (single_major)
6529 unregister_blkdev(rbd_major, RBD_DRV_NAME);
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006530err_out_wq:
6531 destroy_workqueue(rbd_wq);
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006532err_out_slab:
6533 rbd_slab_exit();
Alex Elder1c2a9df2013-05-01 12:43:03 -05006534 return rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006535}
6536
Alex Eldercc344fa2013-02-19 12:25:56 -06006537static void __exit rbd_exit(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006538{
Ilya Dryomovffe312c2014-05-20 15:46:04 +04006539 ida_destroy(&rbd_dev_id_ida);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006540 rbd_sysfs_cleanup();
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006541 if (single_major)
6542 unregister_blkdev(rbd_major, RBD_DRV_NAME);
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006543 destroy_workqueue(rbd_wq);
Alex Elder1c2a9df2013-05-01 12:43:03 -05006544 rbd_slab_exit();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006545}
6546
6547module_init(rbd_init);
6548module_exit(rbd_exit);
6549
Alex Elderd552c612013-05-31 20:13:09 -05006550MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006551MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
6552MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006553/* following authorship retained from original osdblk.c */
6554MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
6555
Ilya Dryomov90da2582013-12-13 15:28:56 +02006556MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006557MODULE_LICENSE("GPL");