
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		 Documentation/ABI/testing/sysfs-bus-rbd

 */

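/*
 * Illustrative only (not part of this file): mapping and unmapping an
 * image through the sysfs interface documented above typically looks
 * like the following, where the monitor address, key, pool and image
 * names are hypothetical placeholders:
 *
 *	# echo "1.2.3.4:6789 name=admin,secret=<key> rbd foo -" \
 *	#	> /sys/bus/rbd/add
 *	# echo 0 > /sys/bus/rbd/remove
 */
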
#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/workqueue.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/*
 * Increment the given counter and return its updated value.
 * If the counter is already 0 it will not be incremented.
 * If the counter is already at its maximum value, -EINVAL is
 * returned without updating it.
 */
static int atomic_inc_return_safe(atomic_t *v)
{
	unsigned int counter;

	counter = (unsigned int)__atomic_add_unless(v, 1, 0);
	if (counter <= (unsigned int)INT_MAX)
		return (int)counter;

	atomic_dec(v);

	return -EINVAL;
}

/* Decrement the counter.  Return the resulting value, or -EINVAL */
static int atomic_dec_return_safe(atomic_t *v)
{
	int counter;

	counter = atomic_dec_return(v);
	if (counter >= 0)
		return counter;

	atomic_inc(v);

	return -EINVAL;
}

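/*
 * Usage sketch (illustrative): these helpers back reference counts
 * such as parent_ref below, where 0 means "no new reference may be
 * taken" and a saturated counter must not wrap:
 *
 *	if (atomic_inc_return_safe(&rbd_dev->parent_ref) > 0)
 *		... use the parent, then atomic_dec_return_safe() ...
 */
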
#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

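/*
 * Worked example (assuming the usual NAME_MAX of 255): the "snap_"
 * prefix is 5 characters, so a snapshot device name may carry at most
 * 255 - 5 = 250 bytes of snapshot name.
 */
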
#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING		(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
#define RBD_FEATURE_DATA_POOL		(1ULL<<7)

#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_DATA_POOL)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These six fields never change for a given rbd image */
	char *object_prefix;
	__u8 obj_order;
	u64 stripe_unit;
	u64 stripe_count;
	s64 data_pool_id;
	u64 features;		/* Might be changeable someday? */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* format 1 only */
	u64 *snap_sizes;	/* format 1 only */
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the ids in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the ids associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	const char	*pool_name;

	const char	*image_id;
	const char	*image_name;

	u64		snap_id;
	const char	*snap_name;

	struct kref	kref;
};

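/*
 * Example (hypothetical values): mapping snapshot "snap1" of image
 * "foo" in pool "rbd" yields a spec whose names are ("rbd", "foo",
 * "snap1") and whose ids are resolved from them; a mapping of the
 * head revision uses snap_id == CEPH_NOSNAP and snap_name "-".
 */
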
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};

struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

enum obj_request_type {
	OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};

enum obj_operation_type {
	OBJ_OP_WRITE,
	OBJ_OP_READ,
	OBJ_OP_DISCARD,
};

enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion flag: not done = 0, done = 1 */
	OBJ_REQ_IMG_DATA,	/* object usage: standalone = 0, image = 1 */
	OBJ_REQ_KNOWN,		/* EXISTS flag valid: no = 0, yes = 1 */
	OBJ_REQ_EXISTS,		/* target exists: no = 0, yes = 1 */
};

struct rbd_obj_request {
	u64			object_no;
	u64			offset;		/* object start byte */
	u64			length;		/* bytes from offset */
	unsigned long		flags;

	/*
	 * An object request associated with an image will have its
	 * img_data flag set; a standalone object request will not.
	 *
	 * A standalone object request will have which == BAD_WHICH
	 * and a null obj_request pointer.
	 *
	 * An object request initiated in support of a layered image
	 * object (to check for its existence before a write) will
	 * have which == BAD_WHICH and a non-null obj_request pointer.
	 *
	 * Finally, an object request for rbd image data will have
	 * which != BAD_WHICH, and will have a non-null img_request
	 * pointer.  The value of which will be in the range
	 * 0..(img_request->obj_request_count-1).
	 */
	union {
		struct rbd_obj_request	*obj_request;	/* STAT op */
		struct {
			struct rbd_img_request	*img_request;
			u64			img_offset;
			/* links for img_request->obj_requests list */
			struct list_head	links;
		};
	};
	u32			which;		/* posn image request list */

	enum obj_request_type	type;
	union {
		struct bio	*bio_list;
		struct {
			struct page	**pages;
			u32		page_count;
		};
	};
	struct page		**copyup_pages;
	u32			copyup_page_count;

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	int			result;

	rbd_obj_callback_t	callback;
	struct completion	completion;

	struct kref		kref;
};

enum img_req_flags {
	IMG_REQ_WRITE,		/* I/O direction: read = 0, write = 1 */
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
	IMG_REQ_DISCARD,	/* discard: normal = 0, discard request = 1 */
};

struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	u64			offset;	/* starting image byte offset */
	u64			length;	/* byte count from offset */
	unsigned long		flags;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};
	struct page		**copyup_pages;
	u32			copyup_page_count;
	spinlock_t		completion_lock;/* protects next_completion */
	u32			next_completion;
	rbd_img_callback_t	callback;
	u64			xferred;/* aggregate bytes transferred */
	int			result;	/* first nonzero obj_request result */

	u32			obj_request_count;
	struct list_head	obj_requests;	/* rbd_obj_request structs */

	struct kref		kref;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)

enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};

enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};

/* WatchNotify::ClientId */
struct rbd_client_id {
	u64 gid;
	u64 handle;
};

struct rbd_mapping {
	u64			size;
	u64			features;
};

/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	int			minor;
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
	char			*config_info;	/* add{,_single_major} string */

	struct ceph_object_id	header_oid;
	struct ceph_object_locator header_oloc;

	struct ceph_file_layout	layout;		/* used for all rbd requests */

	struct mutex		watch_mutex;
	enum rbd_watch_state	watch_state;
	struct ceph_osd_linger_request *watch_handle;
	u64			watch_cookie;
	struct delayed_work	watch_dwork;

	struct rw_semaphore	lock_rwsem;
	enum rbd_lock_state	lock_state;
	char			lock_cookie[32];
	struct rbd_client_id	owner_cid;
	struct work_struct	acquired_lock_work;
	struct work_struct	released_lock_work;
	struct delayed_work	lock_dwork;
	struct work_struct	unlock_work;
	wait_queue_head_t	lock_waitq;

	struct workqueue_struct	*task_wq;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	atomic_t		parent_ref;
	struct rbd_device	*parent;

	/* Block layer tags. */
	struct blk_mq_tag_set	tag_set;

	/* protects updating the header */
	struct rw_semaphore	header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags:
 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 *   by rbd_dev->lock
 * - BLACKLISTED is protected by rbd_dev->lock_rwsem
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
	RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
};

static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;

static struct bio_set		*rbd_bio_clone;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

static struct workqueue_struct *rbd_wq;

/*
 * Default to false for now, as single-major requires >= 0.75 version of
 * userspace rbd utility.
 */
static bool single_major = false;
module_param(single_major, bool, S_IRUGO);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)");

static int rbd_img_request_submit(struct rbd_img_request *img_request);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
				    size_t count);
static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
				       size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
static void rbd_spec_put(struct rbd_spec *spec);

static int rbd_dev_id_to_minor(int dev_id)
{
	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
}

static int minor_to_rbd_dev_id(int minor)
{
	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}

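/*
 * Worked example: with RBD_SINGLE_MAJOR_PART_SHIFT == 4, each device
 * gets 16 minors (the whole-disk node plus 15 partitions), so dev_id 3
 * maps to minor 48, and minors 48..63 all map back to dev_id 3.
 */
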
static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
}

static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	bool is_lock_owner;

	down_read(&rbd_dev->lock_rwsem);
	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
	return is_lock_owner;
}

static ssize_t rbd_supported_features_show(struct bus_type *bus, char *buf)
{
	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}

static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
static BUS_ATTR(supported_features, S_IRUGO, rbd_supported_features_show, NULL);

static struct attribute *rbd_bus_attrs[] = {
	&bus_attr_add.attr,
	&bus_attr_remove.attr,
	&bus_attr_add_single_major.attr,
	&bus_attr_remove_single_major.attr,
	&bus_attr_supported_features.attr,
	NULL,
};

static umode_t rbd_bus_is_visible(struct kobject *kobj,
				  struct attribute *attr, int index)
{
	if (!single_major &&
	    (attr == &bus_attr_add_single_major.attr ||
	     attr == &bus_attr_remove_single_major.attr))
		return 0;

	return attr->mode;
}

static const struct attribute_group rbd_bus_group = {
	.attrs = rbd_bus_attrs,
	.is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_groups	= rbd_bus_groups,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}

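/*
 * Example (illustrative): for a mapped device whose gendisk is rbd0,
 * rbd_warn(rbd_dev, "failed to do foo: %d", -5) lands in the kernel
 * log as "rbd: rbd0: failed to do foo: -5".
 */
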
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static void rbd_osd_copyup_callback(struct rbd_obj_request *obj_request);
static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_header_info(struct rbd_device *rbd_dev);
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size);
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features);

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	(void) get_device(&rbd_dev->dev);

	return 0;
}

static void rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	put_device(&rbd_dev->dev);
}

static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
{
	int ro;

	if (get_user(ro, (int __user *)arg))
		return -EFAULT;

	/* Snapshots can't be marked read-write */
	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
		return -EROFS;

	/* Let blkdev_roset() handle it */
	return -ENOTTY;
}

static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	int ret;

	switch (cmd) {
	case BLKROSET:
		ret = rbd_ioctl_set_ro(rbd_dev, arg);
		break;
	default:
		ret = -ENOTTY;
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	return rbd_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
	.ioctl			= rbd_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl		= rbd_compat_ioctl,
#endif
};

/*
 * Initialize an rbd client instance.  Success or not, this function
 * consumes ceph_opts.  Caller holds client_mutex.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	rbdc->client = ceph_create_client(ceph_opts, rbdc);
	if (IS_ERR(rbdc->client))
		goto out_rbdc;
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_client;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;
out_client:
	ceph_destroy_client(rbdc->client);
out_rbdc:
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}

/*
 * (Per device) rbd map options
 */
enum {
	Opt_queue_depth,
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_err
};

static match_table_t rbd_opts_tokens = {
	{Opt_queue_depth, "queue_depth=%d"},
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	{Opt_lock_on_read, "lock_on_read"},
	{Opt_exclusive, "exclusive"},
	{Opt_err, NULL}
};

struct rbd_options {
	int	queue_depth;
	bool	read_only;
	bool	lock_on_read;
	bool	exclusive;
};

#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
#define RBD_READ_ONLY_DEFAULT	false
#define RBD_LOCK_ON_READ_DEFAULT false
#define RBD_EXCLUSIVE_DEFAULT	false

static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token, argstr[0].from);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_queue_depth:
		if (intval < 1) {
			pr_err("queue_depth out of range\n");
			return -EINVAL;
		}
		rbd_opts->queue_depth = intval;
		break;
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	case Opt_lock_on_read:
		rbd_opts->lock_on_read = true;
		break;
	case Opt_exclusive:
		rbd_opts->exclusive = true;
		break;
	default:
		/* libceph prints "bad option" msg */
		return -EINVAL;
	}

	return 0;
}

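/*
 * Example (hypothetical): a map-time options string such as
 * "rw,queue_depth=128,lock_on_read" is split on commas by the caller
 * and fed to parse_rbd_opts_token() one token at a time, yielding
 * read_only = false, queue_depth = 128 and lock_on_read = true.
 */
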
static char* obj_op_name(enum obj_operation_type op_type)
{
	switch (op_type) {
	case OBJ_OP_READ:
		return "read";
	case OBJ_OP_WRITE:
		return "write";
	case OBJ_OP_DISCARD:
		return "discard";
	default:
		return "???";
	}
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, ceph_opts is consumed by this
 * function.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;

	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
	rbdc = rbd_client_find(ceph_opts);
	if (rbdc)	/* using an existing client */
		ceph_destroy_options(ceph_opts);
	else
		rbdc = rbd_client_create(ceph_opts);
	mutex_unlock(&client_mutex);

	return rbdc;
}

/*
 * Destroy ceph client
 *
 * Takes and releases rbd_client_list_lock to remove the client from
 * the client list.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}

/*
 * returns the size of an object in the image
 */
static u32 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1U << header->obj_order;
}

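/*
 * Worked example: the usual default object order is 22, so
 * rbd_obj_bytes() returns 1U << 22 = 4 MiB; image data is then laid
 * out across 4 MiB backing objects unless fancy striping overrides it.
 */
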
static void rbd_init_layout(struct rbd_device *rbd_dev)
{
	if (rbd_dev->header.stripe_unit == 0 ||
	    rbd_dev->header.stripe_count == 0) {
		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
		rbd_dev->header.stripe_count = 1;
	}

	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
			rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
}

/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
				 struct rbd_image_header_ondisk *ondisk)
{
	struct rbd_image_header *header = &rbd_dev->header;
	bool first_time = header->object_prefix == NULL;
	struct ceph_snap_context *snapc;
	char *object_prefix = NULL;
	char *snap_names = NULL;
	u64 *snap_sizes = NULL;
	u32 snap_count;
	int ret = -ENOMEM;
	u32 i;

	/* Allocate this now to avoid having to handle failure below */

	if (first_time) {
		object_prefix = kstrndup(ondisk->object_prefix,
					 sizeof(ondisk->object_prefix),
					 GFP_KERNEL);
		if (!object_prefix)
			return -ENOMEM;
	}

	/* Allocate the snapshot context and fill it in */

	snap_count = le32_to_cpu(ondisk->snap_count);
	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc)
		goto out_err;
	snapc->seq = le64_to_cpu(ondisk->snap_seq);
	if (snap_count) {
		struct rbd_image_snap_ondisk *snaps;
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* We'll keep a copy of the snapshot names... */

		if (snap_names_len > (u64)SIZE_MAX)
			goto out_2big;
		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!snap_names)
			goto out_err;

		/* ...as well as the array of their sizes. */
		snap_sizes = kmalloc_array(snap_count,
					   sizeof(*header->snap_sizes),
					   GFP_KERNEL);
		if (!snap_sizes)
			goto out_err;

		/*
		 * Copy the names, and fill in each snapshot's id
		 * and size.
		 *
		 * Note that rbd_dev_v1_header_info() guarantees the
		 * ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, so this memcpy() is safe.
		 */
		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
		snaps = ondisk->snaps;
		for (i = 0; i < snap_count; i++) {
			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
		}
	}

	/* We won't fail any more, fill in the header */

	if (first_time) {
		header->object_prefix = object_prefix;
		header->obj_order = ondisk->options.order;
		rbd_init_layout(rbd_dev);
	} else {
		ceph_put_snap_context(header->snapc);
		kfree(header->snap_names);
		kfree(header->snap_sizes);
	}

	/* The remaining fields always get updated (when we refresh) */

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->snapc = snapc;
	header->snap_names = snap_names;
	header->snap_sizes = snap_sizes;

	return 0;
out_2big:
	ret = -EIO;
out_err:
	kfree(snap_sizes);
	kfree(snap_names);
	ceph_put_snap_context(snapc);
	kfree(object_prefix);

	return ret;
}

static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	const char *snap_name;

	rbd_assert(which < rbd_dev->header.snapc->num_snaps);

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which--)
		snap_name += strlen(snap_name) + 1;

	return kstrdup(snap_name, GFP_KERNEL);
}

/*
 * Snapshot id comparison function for use with qsort()/bsearch().
 * Note that result is for snapshots in *descending* order.
 */
static int snapid_compare_reverse(const void *s1, const void *s2)
{
	u64 snap_id1 = *(u64 *)s1;
	u64 snap_id2 = *(u64 *)s2;

	if (snap_id1 < snap_id2)
		return 1;
	return snap_id1 == snap_id2 ? 0 : -1;
}

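/*
 * Worked example: for the descending array { 9, 7, 4, 1 }, comparing
 * ids 7 and 4 returns -1 ("7 sorts before 4"), which is what lets
 * bsearch() below walk a highest-id-first array correctly.
 */
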
/*
 * Search a snapshot context to see if the given snapshot id is
 * present.
 *
 * Returns the position of the snapshot id in the array if it's found,
 * or BAD_SNAP_INDEX otherwise.
 *
 * Note: The snapshot array is kept sorted (by the osd) in
 * reverse order, highest snapshot id first.
 */
static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	u64 *found;

	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
				sizeof (snap_id), snapid_compare_reverse);

	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
}

static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id)
{
	u32 which;
	const char *snap_name;

	which = rbd_dev_snap_index(rbd_dev, snap_id);
	if (which == BAD_SNAP_INDEX)
		return ERR_PTR(-ENOENT);

	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
}

static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
	if (snap_id == CEPH_NOSNAP)
		return RBD_SNAP_HEAD_NAME;

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (rbd_dev->image_format == 1)
		return rbd_dev_v1_snap_name(rbd_dev, snap_id);

	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
}

static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_size)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_size = rbd_dev->header.image_size;
	} else if (rbd_dev->image_format == 1) {
		u32 which;

		which = rbd_dev_snap_index(rbd_dev, snap_id);
		if (which == BAD_SNAP_INDEX)
			return -ENOENT;

		*snap_size = rbd_dev->header.snap_sizes[which];
	} else {
		u64 size = 0;
		int ret;

		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
		if (ret)
			return ret;

		*snap_size = size;
	}
	return 0;
}

static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
			u64 *snap_features)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_features = rbd_dev->header.features;
	} else if (rbd_dev->image_format == 1) {
		*snap_features = 0;	/* No features for format 1 */
	} else {
		u64 features = 0;
		int ret;

		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
		if (ret)
			return ret;

		*snap_features = features;
	}
	return 0;
}

static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
{
	u64 snap_id = rbd_dev->spec->snap_id;
	u64 size = 0;
	u64 features = 0;
	int ret;

	ret = rbd_snap_size(rbd_dev, snap_id, &size);
	if (ret)
		return ret;
	ret = rbd_snap_features(rbd_dev, snap_id, &features);
	if (ret)
		return ret;

	rbd_dev->mapping.size = size;
	rbd_dev->mapping.features = features;

	return 0;
}

static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
{
	rbd_dev->mapping.size = 0;
	rbd_dev->mapping.features = 0;
}

Alex Elder65ccfe22012-08-09 10:33:26 -07001235static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
1236{
Ilya Dryomov5bc3fb12017-01-25 18:16:22 +01001237 u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001238
Alex Elder65ccfe22012-08-09 10:33:26 -07001239 return offset & (segment_size - 1);
1240}
1241
1242static u64 rbd_segment_length(struct rbd_device *rbd_dev,
1243 u64 offset, u64 length)
1244{
Ilya Dryomov5bc3fb12017-01-25 18:16:22 +01001245 u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
Alex Elder65ccfe22012-08-09 10:33:26 -07001246
1247 offset &= segment_size - 1;
1248
Alex Elderaafb2302012-09-06 16:00:54 -05001249 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -07001250 if (offset + length > segment_size)
1251 length = segment_size - offset;
1252
1253 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001254}
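
/*
 * Hedged example, not part of the driver: a plain-C model of the two
 * segment helpers above. It assumes segment_size is a power of two,
 * which holds for rbd since it is 1 << obj_order. With 4 MiB
 * (0x400000) objects, an I/O at image offset 0x3ff000 sits 0x3ff000
 * into its object, and a 0x2000-byte request there is clamped to
 * 0x1000 bytes so it does not cross the object boundary.
 */
static u64 model_segment_offset(u64 segment_size, u64 offset)
{
	return offset & (segment_size - 1);	/* offset within the object */
}

static u64 model_segment_length(u64 segment_size, u64 offset, u64 length)
{
	offset &= segment_size - 1;
	if (offset + length > segment_size)
		length = segment_size - offset;	/* clamp at object boundary */

	return length;
}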
1255
1256/*
1257 * bio helpers
1258 */
1259
1260static void bio_chain_put(struct bio *chain)
1261{
1262 struct bio *tmp;
1263
1264 while (chain) {
1265 tmp = chain;
1266 chain = chain->bi_next;
1267 bio_put(tmp);
1268 }
1269}
1270
1271/*
1272 * zeros a bio chain, starting at specific offset
1273 */
1274static void zero_bio_chain(struct bio *chain, int start_ofs)
1275{
Kent Overstreet79886132013-11-23 17:19:00 -08001276 struct bio_vec bv;
1277 struct bvec_iter iter;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001278 unsigned long flags;
1279 void *buf;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001280 int pos = 0;
1281
1282 while (chain) {
Kent Overstreet79886132013-11-23 17:19:00 -08001283 bio_for_each_segment(bv, chain, iter) {
1284 if (pos + bv.bv_len > start_ofs) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001285 int remainder = max(start_ofs - pos, 0);
Kent Overstreet79886132013-11-23 17:19:00 -08001286 buf = bvec_kmap_irq(&bv, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001287 memset(buf + remainder, 0,
Kent Overstreet79886132013-11-23 17:19:00 -08001288 bv.bv_len - remainder);
1289 flush_dcache_page(bv.bv_page);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +02001290 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001291 }
Kent Overstreet79886132013-11-23 17:19:00 -08001292 pos += bv.bv_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001293 }
1294
1295 chain = chain->bi_next;
1296 }
1297}
1298
1299/*
Alex Elderb9434c52013-04-19 15:34:50 -05001300 * Similar to zero_bio_chain(): zeros data defined by a page array,
1301 * starting at the given byte offset from the start of the array and
1302 * continuing up to the given end offset. The pages array is
1303 * assumed to be big enough to hold all bytes up to the end.
1304 */
1305static void zero_pages(struct page **pages, u64 offset, u64 end)
1306{
1307 struct page **page = &pages[offset >> PAGE_SHIFT];
1308
1309 rbd_assert(end > offset);
1310 rbd_assert(end - offset <= (u64)SIZE_MAX);
1311 while (offset < end) {
1312 size_t page_offset;
1313 size_t length;
1314 unsigned long flags;
1315 void *kaddr;
1316
Geert Uytterhoeven491205a2013-05-13 20:35:37 -05001317 page_offset = offset & ~PAGE_MASK;
1318 length = min_t(size_t, PAGE_SIZE - page_offset, end - offset);
Alex Elderb9434c52013-04-19 15:34:50 -05001319 local_irq_save(flags);
1320 kaddr = kmap_atomic(*page);
1321 memset(kaddr + page_offset, 0, length);
Alex Eldere2156052013-05-22 20:54:25 -05001322 flush_dcache_page(*page);
Alex Elderb9434c52013-04-19 15:34:50 -05001323 kunmap_atomic(kaddr);
1324 local_irq_restore(flags);
1325
1326 offset += length;
1327 page++;
1328 }
1329}
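
/*
 * Hedged example, illustration only: the zero-fill walk used by
 * zero_pages() above, modelled over a flat buffer carved into
 * fixed-size chunks standing in for pages. model_page_size plays the
 * role of PAGE_SIZE and must be a power of two.
 */
static void model_zero_range(u8 *buf, size_t model_page_size,
			     size_t offset, size_t end)
{
	while (offset < end) {
		size_t page_offset = offset & (model_page_size - 1);
		size_t length = min_t(size_t, model_page_size - page_offset,
				      end - offset);

		memset(buf + offset, 0, length);	/* zero one chunk */
		offset += length;
	}
}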
1330
1331/*
Alex Elderf7760da2012-10-20 22:17:27 -05001332 * Clone a portion of a bio, starting at the given byte offset
1333 * and continuing for the number of bytes indicated.
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001334 */
Alex Elderf7760da2012-10-20 22:17:27 -05001335static struct bio *bio_clone_range(struct bio *bio_src,
1336 unsigned int offset,
1337 unsigned int len,
1338 gfp_t gfpmask)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001339{
Alex Elderf7760da2012-10-20 22:17:27 -05001340 struct bio *bio;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001341
NeilBrownf856dc32017-06-18 14:38:58 +10001342 bio = bio_clone_fast(bio_src, gfpmask, rbd_bio_clone);
Alex Elderf7760da2012-10-20 22:17:27 -05001343 if (!bio)
1344 return NULL; /* ENOMEM */
1345
Kent Overstreet5341a6272013-08-07 14:31:11 -07001346 bio_advance(bio, offset);
Kent Overstreet4f024f32013-10-11 15:44:27 -07001347 bio->bi_iter.bi_size = len;
Alex Elder542582f2012-08-09 10:33:25 -07001348
Alex Elderf7760da2012-10-20 22:17:27 -05001349 return bio;
1350}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001351
Alex Elderf7760da2012-10-20 22:17:27 -05001352/*
1353 * Clone a portion of a bio chain, starting at the given byte offset
1354 * into the first bio in the source chain and continuing for the
1355 * number of bytes indicated. The result is another bio chain of
1356 * exactly the given length, or a null pointer on error.
1357 *
1358 * The bio_src and offset parameters are both in-out. On entry they
1359 * refer to the first source bio and the offset into that bio where
1360 * the start of data to be cloned is located.
1361 *
1362 * On return, bio_src is updated to refer to the bio in the source
 1363 * chain that contains the first un-cloned byte, and *offset will
1364 * contain the offset of that byte within that bio.
1365 */
1366static struct bio *bio_chain_clone_range(struct bio **bio_src,
1367 unsigned int *offset,
1368 unsigned int len,
1369 gfp_t gfpmask)
1370{
1371 struct bio *bi = *bio_src;
1372 unsigned int off = *offset;
1373 struct bio *chain = NULL;
1374 struct bio **end;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001375
Alex Elderf7760da2012-10-20 22:17:27 -05001376 /* Build up a chain of clone bios up to the limit */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001377
Kent Overstreet4f024f32013-10-11 15:44:27 -07001378 if (!bi || off >= bi->bi_iter.bi_size || !len)
Alex Elderf7760da2012-10-20 22:17:27 -05001379 return NULL; /* Nothing to clone */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001380
Alex Elderf7760da2012-10-20 22:17:27 -05001381 end = &chain;
1382 while (len) {
1383 unsigned int bi_size;
1384 struct bio *bio;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001385
Alex Elderf5400b72012-11-01 10:17:15 -05001386 if (!bi) {
1387 rbd_warn(NULL, "bio_chain exhausted with %u left", len);
Alex Elderf7760da2012-10-20 22:17:27 -05001388 goto out_err; /* EINVAL; ran out of bio's */
Alex Elderf5400b72012-11-01 10:17:15 -05001389 }
Kent Overstreet4f024f32013-10-11 15:44:27 -07001390 bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len);
Alex Elderf7760da2012-10-20 22:17:27 -05001391 bio = bio_clone_range(bi, off, bi_size, gfpmask);
1392 if (!bio)
1393 goto out_err; /* ENOMEM */
1394
1395 *end = bio;
1396 end = &bio->bi_next;
1397
1398 off += bi_size;
Kent Overstreet4f024f32013-10-11 15:44:27 -07001399 if (off == bi->bi_iter.bi_size) {
Alex Elderf7760da2012-10-20 22:17:27 -05001400 bi = bi->bi_next;
1401 off = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001402 }
Alex Elderf7760da2012-10-20 22:17:27 -05001403 len -= bi_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001404 }
Alex Elderf7760da2012-10-20 22:17:27 -05001405 *bio_src = bi;
1406 *offset = off;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001407
Alex Elderf7760da2012-10-20 22:17:27 -05001408 return chain;
1409out_err:
1410 bio_chain_put(chain);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001411
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001412 return NULL;
1413}
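
/*
 * Hedged usage sketch, not taken from the driver: how a caller can
 * carve consecutive fixed-size pieces off one source chain using the
 * in-out bio_src/offset contract described above. total and piece_len
 * are assumed to describe the source data exactly; consume_piece() is
 * hypothetical.
 */
static int model_clone_in_pieces(struct bio *src, unsigned int total,
				 unsigned int piece_len)
{
	unsigned int off = 0;

	while (total) {
		unsigned int len = min(piece_len, total);
		struct bio *piece;

		/* src and off advance to the first un-cloned byte */
		piece = bio_chain_clone_range(&src, &off, len, GFP_NOIO);
		if (!piece)
			return -ENOMEM;
		consume_piece(piece);		/* hypothetical consumer */
		total -= len;
	}

	return 0;
}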
1414
Alex Elder926f9b32013-02-11 12:33:24 -06001415/*
1416 * The default/initial value for all object request flags is 0. For
1417 * each flag, once its value is set to 1 it is never reset to 0
1418 * again.
1419 */
Alex Elder6365d332013-02-11 12:33:24 -06001420static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
1421{
1422 if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
Alex Elder6365d332013-02-11 12:33:24 -06001423 struct rbd_device *rbd_dev;
1424
Alex Elder57acbaa2013-02-11 12:33:24 -06001425 rbd_dev = obj_request->img_request->rbd_dev;
Ilya Dryomov9584d502014-07-11 12:11:20 +04001426 rbd_warn(rbd_dev, "obj_request %p already marked img_data",
Alex Elder6365d332013-02-11 12:33:24 -06001427 obj_request);
1428 }
1429}
1430
1431static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
1432{
1433 smp_mb();
1434 return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
1435}
1436
Alex Elder57acbaa2013-02-11 12:33:24 -06001437static void obj_request_done_set(struct rbd_obj_request *obj_request)
1438{
1439 if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
1440 struct rbd_device *rbd_dev = NULL;
1441
1442 if (obj_request_img_data_test(obj_request))
1443 rbd_dev = obj_request->img_request->rbd_dev;
Ilya Dryomov9584d502014-07-11 12:11:20 +04001444 rbd_warn(rbd_dev, "obj_request %p already marked done",
Alex Elder57acbaa2013-02-11 12:33:24 -06001445 obj_request);
1446 }
1447}
1448
1449static bool obj_request_done_test(struct rbd_obj_request *obj_request)
1450{
1451 smp_mb();
1452 return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
1453}
1454
Alex Elder5679c592013-02-11 12:33:24 -06001455/*
1456 * This sets the KNOWN flag after (possibly) setting the EXISTS
1457 * flag. The latter is set based on the "exists" value provided.
1458 *
1459 * Note that for our purposes once an object exists it never goes
 1460 * away again. It's possible that the responses from two existence
 1461 * checks are separated by the creation of the target object, and
 1462 * the first ("doesn't exist") response arrives *after* the second
 1463 * ("does exist"). In that case we ignore the late "doesn't exist" one.
1464 */
1465static void obj_request_existence_set(struct rbd_obj_request *obj_request,
1466 bool exists)
1467{
1468 if (exists)
1469 set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
1470 set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
1471 smp_mb();
1472}
1473
1474static bool obj_request_known_test(struct rbd_obj_request *obj_request)
1475{
1476 smp_mb();
1477 return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
1478}
1479
1480static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
1481{
1482 smp_mb();
1483 return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
1484}
1485
Ilya Dryomov96385562014-06-10 13:53:29 +04001486static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
1487{
1488 struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
1489
1490 return obj_request->img_offset <
1491 round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header));
1492}
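
/*
 * Hedged example, illustration only: the rounding in the overlap test
 * above, in plain C. Assumes object_size is a power of two. With a
 * 6 MiB parent overlap and 4 MiB objects the overlap rounds up to
 * 8 MiB, so an object at img_offset 7 MiB may still need parent data
 * while one at 8 MiB cannot.
 */
static bool model_overlaps_parent(u64 img_offset, u64 parent_overlap,
				  u64 object_size)
{
	u64 rounded = (parent_overlap + object_size - 1) &
		      ~(object_size - 1);	/* round_up() for powers of two */

	return img_offset < rounded;
}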
1493
Alex Elderbf0d5f502012-11-22 00:00:08 -06001494static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1495{
Alex Elder37206ee2013-02-20 17:32:08 -06001496 dout("%s: obj %p (was %d)\n", __func__, obj_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001497 kref_read(&obj_request->kref));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001498 kref_get(&obj_request->kref);
1499}
1500
1501static void rbd_obj_request_destroy(struct kref *kref);
1502static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1503{
1504 rbd_assert(obj_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001505 dout("%s: obj %p (was %d)\n", __func__, obj_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001506 kref_read(&obj_request->kref));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001507 kref_put(&obj_request->kref, rbd_obj_request_destroy);
1508}
1509
Alex Elder0f2d5be2014-04-26 14:21:44 +04001510static void rbd_img_request_get(struct rbd_img_request *img_request)
1511{
1512 dout("%s: img %p (was %d)\n", __func__, img_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001513 kref_read(&img_request->kref));
Alex Elder0f2d5be2014-04-26 14:21:44 +04001514 kref_get(&img_request->kref);
1515}
1516
Alex Eldere93f3152013-05-08 22:50:04 -05001517static bool img_request_child_test(struct rbd_img_request *img_request);
1518static void rbd_parent_request_destroy(struct kref *kref);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001519static void rbd_img_request_destroy(struct kref *kref);
1520static void rbd_img_request_put(struct rbd_img_request *img_request)
1521{
1522 rbd_assert(img_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001523 dout("%s: img %p (was %d)\n", __func__, img_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001524 kref_read(&img_request->kref));
Alex Eldere93f3152013-05-08 22:50:04 -05001525 if (img_request_child_test(img_request))
1526 kref_put(&img_request->kref, rbd_parent_request_destroy);
1527 else
1528 kref_put(&img_request->kref, rbd_img_request_destroy);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001529}
1530
1531static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1532 struct rbd_obj_request *obj_request)
1533{
Alex Elder25dcf952013-01-25 17:08:55 -06001534 rbd_assert(obj_request->img_request == NULL);
1535
Alex Elderb155e862013-04-15 14:50:37 -05001536 /* Image request now owns object's original reference */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001537 obj_request->img_request = img_request;
Alex Elder25dcf952013-01-25 17:08:55 -06001538 obj_request->which = img_request->obj_request_count;
Alex Elder6365d332013-02-11 12:33:24 -06001539 rbd_assert(!obj_request_img_data_test(obj_request));
1540 obj_request_img_data_set(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001541 rbd_assert(obj_request->which != BAD_WHICH);
Alex Elder25dcf952013-01-25 17:08:55 -06001542 img_request->obj_request_count++;
1543 list_add_tail(&obj_request->links, &img_request->obj_requests);
Alex Elder37206ee2013-02-20 17:32:08 -06001544 dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1545 obj_request->which);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001546}
1547
1548static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1549 struct rbd_obj_request *obj_request)
1550{
1551 rbd_assert(obj_request->which != BAD_WHICH);
Alex Elder25dcf952013-01-25 17:08:55 -06001552
Alex Elder37206ee2013-02-20 17:32:08 -06001553 dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1554 obj_request->which);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001555 list_del(&obj_request->links);
Alex Elder25dcf952013-01-25 17:08:55 -06001556 rbd_assert(img_request->obj_request_count > 0);
1557 img_request->obj_request_count--;
1558 rbd_assert(obj_request->which == img_request->obj_request_count);
1559 obj_request->which = BAD_WHICH;
Alex Elder6365d332013-02-11 12:33:24 -06001560 rbd_assert(obj_request_img_data_test(obj_request));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001561 rbd_assert(obj_request->img_request == img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001562 obj_request->img_request = NULL;
Alex Elder25dcf952013-01-25 17:08:55 -06001563 obj_request->callback = NULL;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001564 rbd_obj_request_put(obj_request);
1565}
1566
1567static bool obj_request_type_valid(enum obj_request_type type)
1568{
1569 switch (type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001570 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001571 case OBJ_REQUEST_BIO:
Alex Elder788e2df2013-01-17 12:25:27 -06001572 case OBJ_REQUEST_PAGES:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001573 return true;
1574 default:
1575 return false;
1576 }
1577}
1578
Ilya Dryomov4a17dad2016-09-13 21:08:10 +02001579static void rbd_img_obj_callback(struct rbd_obj_request *obj_request);
1580
Ilya Dryomov980917f2016-09-12 18:59:42 +02001581static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001582{
Ilya Dryomov980917f2016-09-12 18:59:42 +02001583 struct ceph_osd_request *osd_req = obj_request->osd_req;
1584
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001585 dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
1586 obj_request, obj_request->object_no, obj_request->offset,
Ilya Dryomov67e2b652017-01-25 18:16:22 +01001587 obj_request->length, osd_req);
Ilya Dryomov4a17dad2016-09-13 21:08:10 +02001588 if (obj_request_img_data_test(obj_request)) {
1589 WARN_ON(obj_request->callback != rbd_img_obj_callback);
1590 rbd_img_request_get(obj_request->img_request);
1591 }
Ilya Dryomov980917f2016-09-12 18:59:42 +02001592 ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001593}
1594
1595static void rbd_img_request_complete(struct rbd_img_request *img_request)
1596{
Alex Elder55f27e02013-04-10 12:34:25 -05001597
Alex Elder37206ee2013-02-20 17:32:08 -06001598 dout("%s: img %p\n", __func__, img_request);
Alex Elder55f27e02013-04-10 12:34:25 -05001599
1600 /*
1601 * If no error occurred, compute the aggregate transfer
1602 * count for the image request. We could instead use
1603 * atomic64_cmpxchg() to update it as each object request
 1604 * completes; it's not clear offhand which way is better.
1605 */
1606 if (!img_request->result) {
1607 struct rbd_obj_request *obj_request;
1608 u64 xferred = 0;
1609
1610 for_each_obj_request(img_request, obj_request)
1611 xferred += obj_request->xferred;
1612 img_request->xferred = xferred;
1613 }
1614
Alex Elderbf0d5f502012-11-22 00:00:08 -06001615 if (img_request->callback)
1616 img_request->callback(img_request);
1617 else
1618 rbd_img_request_put(img_request);
1619}
1620
Alex Elder0c425242013-02-08 09:55:49 -06001621/*
1622 * The default/initial value for all image request flags is 0. Each
1623 * is conditionally set to 1 at image request initialization time
 1624 * and currently never changes thereafter.
1625 */
1626static void img_request_write_set(struct rbd_img_request *img_request)
1627{
1628 set_bit(IMG_REQ_WRITE, &img_request->flags);
1629 smp_mb();
1630}
1631
1632static bool img_request_write_test(struct rbd_img_request *img_request)
1633{
1634 smp_mb();
1635 return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
1636}
1637
Guangliang Zhao90e98c52014-04-01 22:22:16 +08001638/*
 1639 * Set the discard flag when the img_request is a discard request
1640 */
1641static void img_request_discard_set(struct rbd_img_request *img_request)
1642{
1643 set_bit(IMG_REQ_DISCARD, &img_request->flags);
1644 smp_mb();
1645}
1646
1647static bool img_request_discard_test(struct rbd_img_request *img_request)
1648{
1649 smp_mb();
1650 return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0;
1651}
1652
Alex Elder9849e982013-01-24 16:13:36 -06001653static void img_request_child_set(struct rbd_img_request *img_request)
1654{
1655 set_bit(IMG_REQ_CHILD, &img_request->flags);
1656 smp_mb();
1657}
1658
Alex Eldere93f3152013-05-08 22:50:04 -05001659static void img_request_child_clear(struct rbd_img_request *img_request)
1660{
1661 clear_bit(IMG_REQ_CHILD, &img_request->flags);
1662 smp_mb();
1663}
1664
Alex Elder9849e982013-01-24 16:13:36 -06001665static bool img_request_child_test(struct rbd_img_request *img_request)
1666{
1667 smp_mb();
1668 return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
1669}
1670
Alex Elderd0b2e942013-01-24 16:13:36 -06001671static void img_request_layered_set(struct rbd_img_request *img_request)
1672{
1673 set_bit(IMG_REQ_LAYERED, &img_request->flags);
1674 smp_mb();
1675}
1676
Alex Eldera2acd002013-05-08 22:50:04 -05001677static void img_request_layered_clear(struct rbd_img_request *img_request)
1678{
1679 clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1680 smp_mb();
1681}
1682
Alex Elderd0b2e942013-01-24 16:13:36 -06001683static bool img_request_layered_test(struct rbd_img_request *img_request)
1684{
1685 smp_mb();
1686 return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1687}
1688
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001689static enum obj_operation_type
1690rbd_img_request_op_type(struct rbd_img_request *img_request)
1691{
1692 if (img_request_write_test(img_request))
1693 return OBJ_OP_WRITE;
1694 else if (img_request_discard_test(img_request))
1695 return OBJ_OP_DISCARD;
1696 else
1697 return OBJ_OP_READ;
1698}
1699
Alex Elder6e2a4502013-03-27 09:16:30 -05001700static void
1701rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
1702{
Alex Elderb9434c52013-04-19 15:34:50 -05001703 u64 xferred = obj_request->xferred;
1704 u64 length = obj_request->length;
1705
Alex Elder6e2a4502013-03-27 09:16:30 -05001706 dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1707 obj_request, obj_request->img_request, obj_request->result,
Alex Elderb9434c52013-04-19 15:34:50 -05001708 xferred, length);
Alex Elder6e2a4502013-03-27 09:16:30 -05001709 /*
Josh Durgin17c1cc12013-08-26 17:55:38 -07001710 * ENOENT means a hole in the image. We zero-fill the entire
1711 * length of the request. A short read also implies zero-fill
1712 * to the end of the request. An error requires the whole
1713 * length of the request to be reported finished with an error
1714 * to the block layer. In each case we update the xferred
1715 * count to indicate the whole request was satisfied.
Alex Elder6e2a4502013-03-27 09:16:30 -05001716 */
Alex Elderb9434c52013-04-19 15:34:50 -05001717 rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
Alex Elder6e2a4502013-03-27 09:16:30 -05001718 if (obj_request->result == -ENOENT) {
Alex Elderb9434c52013-04-19 15:34:50 -05001719 if (obj_request->type == OBJ_REQUEST_BIO)
1720 zero_bio_chain(obj_request->bio_list, 0);
1721 else
1722 zero_pages(obj_request->pages, 0, length);
Alex Elder6e2a4502013-03-27 09:16:30 -05001723 obj_request->result = 0;
Alex Elderb9434c52013-04-19 15:34:50 -05001724 } else if (xferred < length && !obj_request->result) {
1725 if (obj_request->type == OBJ_REQUEST_BIO)
1726 zero_bio_chain(obj_request->bio_list, xferred);
1727 else
1728 zero_pages(obj_request->pages, xferred, length);
Alex Elder6e2a4502013-03-27 09:16:30 -05001729 }
Josh Durgin17c1cc12013-08-26 17:55:38 -07001730 obj_request->xferred = length;
Alex Elder6e2a4502013-03-27 09:16:30 -05001731 obj_request_done_set(obj_request);
1732}
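
/*
 * Hedged model, illustration only, of the three read outcomes handled
 * above: a hole (-ENOENT) zero-fills the whole request and succeeds, a
 * successful short read zero-fills the tail, and in every case the
 * transfer count reported upward is the full request length.
 */
static void model_read_fixup(u8 *buf, size_t length, size_t xferred,
			     int *result)
{
	if (*result == -ENOENT) {
		memset(buf, 0, length);		/* hole: all zeros */
		*result = 0;
	} else if (xferred < length && !*result) {
		memset(buf + xferred, 0, length - xferred); /* short read */
	}
	/* the caller then reports xferred == length in all cases */
}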
1733
Alex Elderbf0d5f502012-11-22 00:00:08 -06001734static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1735{
Alex Elder37206ee2013-02-20 17:32:08 -06001736 dout("%s: obj %p cb %p\n", __func__, obj_request,
1737 obj_request->callback);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001738 if (obj_request->callback)
1739 obj_request->callback(obj_request);
Alex Elder788e2df2013-01-17 12:25:27 -06001740 else
1741 complete_all(&obj_request->completion);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001742}
1743
Ilya Dryomov0dcc6852016-09-26 15:43:52 +02001744static void rbd_obj_request_error(struct rbd_obj_request *obj_request, int err)
1745{
1746 obj_request->result = err;
1747 obj_request->xferred = 0;
1748 /*
1749 * kludge - mirror rbd_obj_request_submit() to match a put in
1750 * rbd_img_obj_callback()
1751 */
1752 if (obj_request_img_data_test(obj_request)) {
1753 WARN_ON(obj_request->callback != rbd_img_obj_callback);
1754 rbd_img_request_get(obj_request->img_request);
1755 }
1756 obj_request_done_set(obj_request);
1757 rbd_obj_request_complete(obj_request);
1758}
1759
Alex Elderc47f9372013-02-26 14:23:07 -06001760static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001761{
Alex Elder57acbaa2013-02-11 12:33:24 -06001762 struct rbd_img_request *img_request = NULL;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05001763 struct rbd_device *rbd_dev = NULL;
Alex Elder57acbaa2013-02-11 12:33:24 -06001764 bool layered = false;
1765
1766 if (obj_request_img_data_test(obj_request)) {
1767 img_request = obj_request->img_request;
1768 layered = img_request && img_request_layered_test(img_request);
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05001769 rbd_dev = img_request->rbd_dev;
Alex Elder57acbaa2013-02-11 12:33:24 -06001770 }
Alex Elder8b3e1a52013-01-24 16:13:36 -06001771
1772 dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1773 obj_request, img_request, obj_request->result,
1774 obj_request->xferred, obj_request->length);
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05001775 if (layered && obj_request->result == -ENOENT &&
1776 obj_request->img_offset < rbd_dev->parent_overlap)
Alex Elder8b3e1a52013-01-24 16:13:36 -06001777 rbd_img_parent_read(obj_request);
1778 else if (img_request)
Alex Elder6e2a4502013-03-27 09:16:30 -05001779 rbd_img_obj_request_read_callback(obj_request);
1780 else
1781 obj_request_done_set(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001782}
1783
Alex Elderc47f9372013-02-26 14:23:07 -06001784static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001785{
Sage Weil1b83bef2013-02-25 16:11:12 -08001786 dout("%s: obj %p result %d %llu\n", __func__, obj_request,
1787 obj_request->result, obj_request->length);
1788 /*
Alex Elder8b3e1a52013-01-24 16:13:36 -06001789 * There is no such thing as a successful short write. Set
 1790 * the transfer count to our originally-requested length.
Sage Weil1b83bef2013-02-25 16:11:12 -08001791 */
1792 obj_request->xferred = obj_request->length;
Alex Elder07741302013-02-05 23:41:50 -06001793 obj_request_done_set(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001794}
1795
Guangliang Zhao90e98c52014-04-01 22:22:16 +08001796static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request)
1797{
1798 dout("%s: obj %p result %d %llu\n", __func__, obj_request,
1799 obj_request->result, obj_request->length);
1800 /*
1801 * There is no such thing as a successful short discard. Set
 1802 * the transfer count to our originally-requested length.
1803 */
1804 obj_request->xferred = obj_request->length;
Josh Durgind0265de2014-04-07 16:54:10 -07001805 /* discarding a non-existent object is not a problem */
1806 if (obj_request->result == -ENOENT)
1807 obj_request->result = 0;
Guangliang Zhao90e98c52014-04-01 22:22:16 +08001808 obj_request_done_set(obj_request);
1809}
1810
Alex Elderfbfab532013-02-08 09:55:48 -06001811/*
1812 * For a simple stat call there's nothing to do. We'll do more if
1813 * this is part of a write sequence for a layered image.
1814 */
Alex Elderc47f9372013-02-26 14:23:07 -06001815static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
Alex Elderfbfab532013-02-08 09:55:48 -06001816{
Alex Elder37206ee2013-02-20 17:32:08 -06001817 dout("%s: obj %p\n", __func__, obj_request);
Alex Elderfbfab532013-02-08 09:55:48 -06001818 obj_request_done_set(obj_request);
1819}
1820
Ilya Dryomov27617132015-07-16 17:36:11 +03001821static void rbd_osd_call_callback(struct rbd_obj_request *obj_request)
1822{
1823 dout("%s: obj %p\n", __func__, obj_request);
1824
1825 if (obj_request_img_data_test(obj_request))
1826 rbd_osd_copyup_callback(obj_request);
1827 else
1828 obj_request_done_set(obj_request);
1829}
1830
Ilya Dryomov85e084f2016-04-28 16:07:24 +02001831static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001832{
1833 struct rbd_obj_request *obj_request = osd_req->r_priv;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001834 u16 opcode;
1835
Ilya Dryomov85e084f2016-04-28 16:07:24 +02001836 dout("%s: osd_req %p\n", __func__, osd_req);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001837 rbd_assert(osd_req == obj_request->osd_req);
Alex Elder57acbaa2013-02-11 12:33:24 -06001838 if (obj_request_img_data_test(obj_request)) {
1839 rbd_assert(obj_request->img_request);
1840 rbd_assert(obj_request->which != BAD_WHICH);
1841 } else {
1842 rbd_assert(obj_request->which == BAD_WHICH);
1843 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06001844
Sage Weil1b83bef2013-02-25 16:11:12 -08001845 if (osd_req->r_result < 0)
1846 obj_request->result = osd_req->r_result;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001847
Alex Elderc47f9372013-02-26 14:23:07 -06001848 /*
1849 * We support a 64-bit length, but ultimately it has to be
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01001850 * passed to the block layer, which just supports a 32-bit
1851 * length field.
Alex Elderc47f9372013-02-26 14:23:07 -06001852 */
Yan, Zheng7665d852016-01-07 16:48:57 +08001853 obj_request->xferred = osd_req->r_ops[0].outdata_len;
Alex Elder8b3e1a52013-01-24 16:13:36 -06001854 rbd_assert(obj_request->xferred < (u64)UINT_MAX);
Ilya Dryomov0ccd5922014-02-25 16:22:28 +02001855
Alex Elder79528732013-04-03 21:32:51 -05001856 opcode = osd_req->r_ops[0].op;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001857 switch (opcode) {
1858 case CEPH_OSD_OP_READ:
Alex Elderc47f9372013-02-26 14:23:07 -06001859 rbd_osd_read_callback(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001860 break;
Ilya Dryomov0ccd5922014-02-25 16:22:28 +02001861 case CEPH_OSD_OP_SETALLOCHINT:
Ilya Dryomove30b7572015-10-07 17:27:17 +02001862 rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE ||
1863 osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL);
Ilya Dryomov0ccd5922014-02-25 16:22:28 +02001864 /* fall through */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001865 case CEPH_OSD_OP_WRITE:
Ilya Dryomove30b7572015-10-07 17:27:17 +02001866 case CEPH_OSD_OP_WRITEFULL:
Alex Elderc47f9372013-02-26 14:23:07 -06001867 rbd_osd_write_callback(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001868 break;
Alex Elderfbfab532013-02-08 09:55:48 -06001869 case CEPH_OSD_OP_STAT:
Alex Elderc47f9372013-02-26 14:23:07 -06001870 rbd_osd_stat_callback(obj_request);
Alex Elderfbfab532013-02-08 09:55:48 -06001871 break;
Guangliang Zhao90e98c52014-04-01 22:22:16 +08001872 case CEPH_OSD_OP_DELETE:
1873 case CEPH_OSD_OP_TRUNCATE:
1874 case CEPH_OSD_OP_ZERO:
1875 rbd_osd_discard_callback(obj_request);
1876 break;
Alex Elder36be9a72013-01-19 00:30:28 -06001877 case CEPH_OSD_OP_CALL:
Ilya Dryomov27617132015-07-16 17:36:11 +03001878 rbd_osd_call_callback(obj_request);
1879 break;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001880 default:
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001881 rbd_warn(NULL, "unexpected OSD op: object_no %016llx opcode %d",
1882 obj_request->object_no, opcode);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001883 break;
1884 }
1885
Alex Elder07741302013-02-05 23:41:50 -06001886 if (obj_request_done_test(obj_request))
Alex Elderbf0d5f502012-11-22 00:00:08 -06001887 rbd_obj_request_complete(obj_request);
1888}
1889
Alex Elder9d4df012013-04-19 15:34:50 -05001890static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
Alex Elder430c28c2013-04-03 21:32:51 -05001891{
Alex Elder8c042b02013-04-03 01:28:58 -05001892 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder430c28c2013-04-03 21:32:51 -05001893
Ilya Dryomov7c848832016-09-15 17:56:39 +02001894 rbd_assert(obj_request_img_data_test(obj_request));
1895 osd_req->r_snapid = obj_request->img_request->snap_id;
Alex Elder9d4df012013-04-19 15:34:50 -05001896}
1897
1898static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
1899{
Alex Elder9d4df012013-04-19 15:34:50 -05001900 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder9d4df012013-04-19 15:34:50 -05001901
Deepa Dinamani1134e092017-05-08 15:59:19 -07001902 ktime_get_real_ts(&osd_req->r_mtime);
Ilya Dryomovbb873b52016-05-26 00:29:52 +02001903 osd_req->r_data_offset = obj_request->offset;
Alex Elder430c28c2013-04-03 21:32:51 -05001904}
1905
Ilya Dryomovbc812072017-01-25 18:16:23 +01001906static struct ceph_osd_request *
1907__rbd_osd_req_create(struct rbd_device *rbd_dev,
1908 struct ceph_snap_context *snapc,
1909 int num_ops, unsigned int flags,
1910 struct rbd_obj_request *obj_request)
1911{
1912 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1913 struct ceph_osd_request *req;
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001914 const char *name_format = rbd_dev->image_format == 1 ?
1915 RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001916
1917 req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
1918 if (!req)
1919 return NULL;
1920
1921 req->r_flags = flags;
1922 req->r_callback = rbd_osd_req_callback;
1923 req->r_priv = obj_request;
1924
1925 req->r_base_oloc.pool = rbd_dev->layout.pool_id;
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001926 if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
1927 rbd_dev->header.object_prefix, obj_request->object_no))
Ilya Dryomovbc812072017-01-25 18:16:23 +01001928 goto err_req;
1929
1930 if (ceph_osdc_alloc_messages(req, GFP_NOIO))
1931 goto err_req;
1932
1933 return req;
1934
1935err_req:
1936 ceph_osdc_put_request(req);
1937 return NULL;
1938}
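
/*
 * Hedged example, illustration only: how the name_format chosen above
 * turns an object number into a data object name. The real format
 * strings live in rbd_types.h; this assumes they are printf-style
 * "%s.%012llx" (format 1) and "%s.%016llx" (format 2). With a
 * hypothetical format 2 prefix "rbd_data.101474b0dc51", object 1
 * would be named "rbd_data.101474b0dc51.0000000000000001".
 */
static void model_object_name(char *dst, size_t len, int image_format,
			      const char *prefix, u64 object_no)
{
	snprintf(dst, len, image_format == 1 ? "%s.%012llx" : "%s.%016llx",
		 prefix, (unsigned long long)object_no);
}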
1939
Ilya Dryomov0ccd5922014-02-25 16:22:28 +02001940/*
1941 * Create an osd request. A read request has one osd op (read).
1942 * A write request has either one (watch) or two (hint+write) osd ops.
1943 * (All rbd data writes are prefixed with an allocation hint op, but
1944 * technically osd watch is a write request, hence this distinction.)
1945 */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001946static struct ceph_osd_request *rbd_osd_req_create(
1947 struct rbd_device *rbd_dev,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08001948 enum obj_operation_type op_type,
Ilya Dryomovdeb236b2014-02-25 16:22:27 +02001949 unsigned int num_ops,
Alex Elder430c28c2013-04-03 21:32:51 -05001950 struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001951{
Alex Elderbf0d5f502012-11-22 00:00:08 -06001952 struct ceph_snap_context *snapc = NULL;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001953
Guangliang Zhao90e98c52014-04-01 22:22:16 +08001954 if (obj_request_img_data_test(obj_request) &&
1955 (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) {
Alex Elder6365d332013-02-11 12:33:24 -06001956 struct rbd_img_request *img_request = obj_request->img_request;
Guangliang Zhao90e98c52014-04-01 22:22:16 +08001957 if (op_type == OBJ_OP_WRITE) {
1958 rbd_assert(img_request_write_test(img_request));
1959 } else {
1960 rbd_assert(img_request_discard_test(img_request));
1961 }
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08001962 snapc = img_request->snapc;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001963 }
1964
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08001965 rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2));
Ilya Dryomovdeb236b2014-02-25 16:22:27 +02001966
Ilya Dryomovbc812072017-01-25 18:16:23 +01001967 return __rbd_osd_req_create(rbd_dev, snapc, num_ops,
1968 (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) ?
Ilya Dryomov54ea0042017-02-11 18:48:41 +01001969 CEPH_OSD_FLAG_WRITE : CEPH_OSD_FLAG_READ, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001970}
1971
Alex Elder0eefd472013-04-19 15:34:50 -05001972/*
Josh Durgind3246fb2014-04-07 16:49:21 -07001973 * Create a copyup osd request based on the information in the object
 1974 * request supplied. A copyup request has two or three osd ops:
 1975 * a copyup method call, potentially a hint op, and a write,
 1976 * truncate, or zero op.
Alex Elder0eefd472013-04-19 15:34:50 -05001977 */
1978static struct ceph_osd_request *
1979rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
1980{
1981 struct rbd_img_request *img_request;
Josh Durgind3246fb2014-04-07 16:49:21 -07001982 int num_osd_ops = 3;
Alex Elder0eefd472013-04-19 15:34:50 -05001983
1984 rbd_assert(obj_request_img_data_test(obj_request));
1985 img_request = obj_request->img_request;
1986 rbd_assert(img_request);
Josh Durgind3246fb2014-04-07 16:49:21 -07001987 rbd_assert(img_request_write_test(img_request) ||
1988 img_request_discard_test(img_request));
Alex Elder0eefd472013-04-19 15:34:50 -05001989
Josh Durgind3246fb2014-04-07 16:49:21 -07001990 if (img_request_discard_test(img_request))
1991 num_osd_ops = 2;
1992
Ilya Dryomovbc812072017-01-25 18:16:23 +01001993 return __rbd_osd_req_create(img_request->rbd_dev,
1994 img_request->snapc, num_osd_ops,
Ilya Dryomov54ea0042017-02-11 18:48:41 +01001995 CEPH_OSD_FLAG_WRITE, obj_request);
Alex Elder0eefd472013-04-19 15:34:50 -05001996}
1997
Alex Elderbf0d5f502012-11-22 00:00:08 -06001998static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
1999{
2000 ceph_osdc_put_request(osd_req);
2001}
2002
Ilya Dryomov6c696d82017-01-25 18:16:23 +01002003static struct rbd_obj_request *
2004rbd_obj_request_create(enum obj_request_type type)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002005{
2006 struct rbd_obj_request *obj_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002007
2008 rbd_assert(obj_request_type_valid(type));
2009
Ilya Dryomov5a60e872015-06-24 17:24:33 +03002010 obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
Ilya Dryomov6c696d82017-01-25 18:16:23 +01002011 if (!obj_request)
Alex Elderf907ad52013-05-01 12:43:03 -05002012 return NULL;
Alex Elderf907ad52013-05-01 12:43:03 -05002013
Alex Elderbf0d5f502012-11-22 00:00:08 -06002014 obj_request->which = BAD_WHICH;
2015 obj_request->type = type;
2016 INIT_LIST_HEAD(&obj_request->links);
Alex Elder788e2df2013-01-17 12:25:27 -06002017 init_completion(&obj_request->completion);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002018 kref_init(&obj_request->kref);
2019
Ilya Dryomov67e2b652017-01-25 18:16:22 +01002020 dout("%s %p\n", __func__, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002021 return obj_request;
2022}
2023
2024static void rbd_obj_request_destroy(struct kref *kref)
2025{
2026 struct rbd_obj_request *obj_request;
2027
2028 obj_request = container_of(kref, struct rbd_obj_request, kref);
2029
Alex Elder37206ee2013-02-20 17:32:08 -06002030 dout("%s: obj %p\n", __func__, obj_request);
2031
Alex Elderbf0d5f502012-11-22 00:00:08 -06002032 rbd_assert(obj_request->img_request == NULL);
2033 rbd_assert(obj_request->which == BAD_WHICH);
2034
2035 if (obj_request->osd_req)
2036 rbd_osd_req_destroy(obj_request->osd_req);
2037
2038 rbd_assert(obj_request_type_valid(obj_request->type));
2039 switch (obj_request->type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06002040 case OBJ_REQUEST_NODATA:
2041 break; /* Nothing to do */
Alex Elderbf0d5f502012-11-22 00:00:08 -06002042 case OBJ_REQUEST_BIO:
2043 if (obj_request->bio_list)
2044 bio_chain_put(obj_request->bio_list);
2045 break;
Alex Elder788e2df2013-01-17 12:25:27 -06002046 case OBJ_REQUEST_PAGES:
Ilya Dryomov04dc9232016-09-15 18:05:16 +02002047 /* img_data requests don't own their page array */
2048 if (obj_request->pages &&
2049 !obj_request_img_data_test(obj_request))
Alex Elder788e2df2013-01-17 12:25:27 -06002050 ceph_release_page_vector(obj_request->pages,
2051 obj_request->page_count);
2052 break;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002053 }
2054
Alex Elder868311b2013-05-01 12:43:03 -05002055 kmem_cache_free(rbd_obj_request_cache, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002056}
2057
Alex Elderfb65d2282013-05-08 22:50:04 -05002058/* It's OK to call this for a device with no parent */
2059
2060static void rbd_spec_put(struct rbd_spec *spec);
2061static void rbd_dev_unparent(struct rbd_device *rbd_dev)
2062{
2063 rbd_dev_remove_parent(rbd_dev);
2064 rbd_spec_put(rbd_dev->parent_spec);
2065 rbd_dev->parent_spec = NULL;
2066 rbd_dev->parent_overlap = 0;
2067}
2068
Alex Elderbf0d5f502012-11-22 00:00:08 -06002069/*
Alex Eldera2acd002013-05-08 22:50:04 -05002070 * Parent image reference counting is used to determine when an
2071 * image's parent fields can be safely torn down--after there are no
2072 * more in-flight requests to the parent image. When the last
2073 * reference is dropped, cleaning them up is safe.
2074 */
2075static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
2076{
2077 int counter;
2078
2079 if (!rbd_dev->parent_spec)
2080 return;
2081
2082 counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
2083 if (counter > 0)
2084 return;
2085
2086 /* Last reference; clean up parent data structures */
2087
2088 if (!counter)
2089 rbd_dev_unparent(rbd_dev);
2090 else
Ilya Dryomov9584d502014-07-11 12:11:20 +04002091 rbd_warn(rbd_dev, "parent reference underflow");
Alex Eldera2acd002013-05-08 22:50:04 -05002092}
2093
2094/*
2095 * If an image has a non-zero parent overlap, get a reference to its
2096 * parent.
2097 *
2098 * Returns true if the rbd device has a parent with a non-zero
2099 * overlap and a reference for it was successfully taken, or
2100 * false otherwise.
2101 */
2102static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
2103{
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03002104 int counter = 0;
Alex Eldera2acd002013-05-08 22:50:04 -05002105
2106 if (!rbd_dev->parent_spec)
2107 return false;
2108
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03002109 down_read(&rbd_dev->header_rwsem);
2110 if (rbd_dev->parent_overlap)
2111 counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
2112 up_read(&rbd_dev->header_rwsem);
Alex Eldera2acd002013-05-08 22:50:04 -05002113
2114 if (counter < 0)
Ilya Dryomov9584d502014-07-11 12:11:20 +04002115 rbd_warn(rbd_dev, "parent reference overflow");
Alex Eldera2acd002013-05-08 22:50:04 -05002116
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03002117 return counter > 0;
Alex Eldera2acd002013-05-08 22:50:04 -05002118}
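
/*
 * Hedged usage sketch, not taken from the driver: the intended pairing
 * of the two helpers above. A caller that wants to redirect I/O to the
 * parent takes a reference first and proceeds only while the overlap
 * still exists; read_from_parent() is hypothetical, and in the driver
 * the put happens when the parent I/O completes rather than inline.
 */
static int model_parent_io(struct rbd_device *rbd_dev)
{
	if (!rbd_dev_parent_get(rbd_dev))
		return -ENOENT;		/* no parent, or overlap went away */

	read_from_parent(rbd_dev);	/* hypothetical parent I/O */
	rbd_dev_parent_put(rbd_dev);	/* matching put */

	return 0;
}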
2119
Alex Elderbf0d5f502012-11-22 00:00:08 -06002120/*
2121 * Caller is responsible for filling in the list of object requests
2122 * that comprises the image request, and the Linux request pointer
2123 * (if there is one).
2124 */
Alex Eldercc344fa2013-02-19 12:25:56 -06002125static struct rbd_img_request *rbd_img_request_create(
2126 struct rbd_device *rbd_dev,
Alex Elderbf0d5f502012-11-22 00:00:08 -06002127 u64 offset, u64 length,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08002128 enum obj_operation_type op_type,
Josh Durgin4e752f02014-04-08 11:12:11 -07002129 struct ceph_snap_context *snapc)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002130{
2131 struct rbd_img_request *img_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002132
Ilya Dryomov7a716aa2014-08-05 11:25:54 +04002133 img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002134 if (!img_request)
2135 return NULL;
2136
Alex Elderbf0d5f502012-11-22 00:00:08 -06002137 img_request->rq = NULL;
2138 img_request->rbd_dev = rbd_dev;
2139 img_request->offset = offset;
2140 img_request->length = length;
Alex Elder0c425242013-02-08 09:55:49 -06002141 img_request->flags = 0;
Guangliang Zhao90e98c52014-04-01 22:22:16 +08002142 if (op_type == OBJ_OP_DISCARD) {
2143 img_request_discard_set(img_request);
2144 img_request->snapc = snapc;
2145 } else if (op_type == OBJ_OP_WRITE) {
Alex Elder0c425242013-02-08 09:55:49 -06002146 img_request_write_set(img_request);
Josh Durgin4e752f02014-04-08 11:12:11 -07002147 img_request->snapc = snapc;
Alex Elder0c425242013-02-08 09:55:49 -06002148 } else {
Alex Elderbf0d5f502012-11-22 00:00:08 -06002149 img_request->snap_id = rbd_dev->spec->snap_id;
Alex Elder0c425242013-02-08 09:55:49 -06002150 }
Alex Eldera2acd002013-05-08 22:50:04 -05002151 if (rbd_dev_parent_get(rbd_dev))
Alex Elderd0b2e942013-01-24 16:13:36 -06002152 img_request_layered_set(img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002153 spin_lock_init(&img_request->completion_lock);
2154 img_request->next_completion = 0;
2155 img_request->callback = NULL;
Alex Eldera5a337d2013-01-24 16:13:36 -06002156 img_request->result = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002157 img_request->obj_request_count = 0;
2158 INIT_LIST_HEAD(&img_request->obj_requests);
2159 kref_init(&img_request->kref);
2160
Alex Elder37206ee2013-02-20 17:32:08 -06002161 dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08002162 obj_op_name(op_type), offset, length, img_request);
Alex Elder37206ee2013-02-20 17:32:08 -06002163
Alex Elderbf0d5f502012-11-22 00:00:08 -06002164 return img_request;
2165}
2166
2167static void rbd_img_request_destroy(struct kref *kref)
2168{
2169 struct rbd_img_request *img_request;
2170 struct rbd_obj_request *obj_request;
2171 struct rbd_obj_request *next_obj_request;
2172
2173 img_request = container_of(kref, struct rbd_img_request, kref);
2174
Alex Elder37206ee2013-02-20 17:32:08 -06002175 dout("%s: img %p\n", __func__, img_request);
2176
Alex Elderbf0d5f502012-11-22 00:00:08 -06002177 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2178 rbd_img_obj_request_del(img_request, obj_request);
Alex Elder25dcf952013-01-25 17:08:55 -06002179 rbd_assert(img_request->obj_request_count == 0);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002180
Alex Eldera2acd002013-05-08 22:50:04 -05002181 if (img_request_layered_test(img_request)) {
2182 img_request_layered_clear(img_request);
2183 rbd_dev_parent_put(img_request->rbd_dev);
2184 }
2185
Josh Durginbef95452014-04-04 17:47:52 -07002186 if (img_request_write_test(img_request) ||
2187 img_request_discard_test(img_request))
Alex Elder812164f82013-04-30 00:44:32 -05002188 ceph_put_snap_context(img_request->snapc);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002189
Alex Elder1c2a9df2013-05-01 12:43:03 -05002190 kmem_cache_free(rbd_img_request_cache, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002191}
2192
Alex Eldere93f3152013-05-08 22:50:04 -05002193static struct rbd_img_request *rbd_parent_request_create(
2194 struct rbd_obj_request *obj_request,
2195 u64 img_offset, u64 length)
2196{
2197 struct rbd_img_request *parent_request;
2198 struct rbd_device *rbd_dev;
2199
2200 rbd_assert(obj_request->img_request);
2201 rbd_dev = obj_request->img_request->rbd_dev;
2202
Josh Durgin4e752f02014-04-08 11:12:11 -07002203 parent_request = rbd_img_request_create(rbd_dev->parent, img_offset,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08002204 length, OBJ_OP_READ, NULL);
Alex Eldere93f3152013-05-08 22:50:04 -05002205 if (!parent_request)
2206 return NULL;
2207
2208 img_request_child_set(parent_request);
2209 rbd_obj_request_get(obj_request);
2210 parent_request->obj_request = obj_request;
2211
2212 return parent_request;
2213}
2214
2215static void rbd_parent_request_destroy(struct kref *kref)
2216{
2217 struct rbd_img_request *parent_request;
2218 struct rbd_obj_request *orig_request;
2219
2220 parent_request = container_of(kref, struct rbd_img_request, kref);
2221 orig_request = parent_request->obj_request;
2222
2223 parent_request->obj_request = NULL;
2224 rbd_obj_request_put(orig_request);
2225 img_request_child_clear(parent_request);
2226
2227 rbd_img_request_destroy(kref);
2228}
2229
Alex Elder12178572013-02-08 09:55:49 -06002230static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
2231{
Alex Elder6365d332013-02-11 12:33:24 -06002232 struct rbd_img_request *img_request;
Alex Elder12178572013-02-08 09:55:49 -06002233 unsigned int xferred;
2234 int result;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002235 bool more;
Alex Elder12178572013-02-08 09:55:49 -06002236
Alex Elder6365d332013-02-11 12:33:24 -06002237 rbd_assert(obj_request_img_data_test(obj_request));
2238 img_request = obj_request->img_request;
2239
Alex Elder12178572013-02-08 09:55:49 -06002240 rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
2241 xferred = (unsigned int)obj_request->xferred;
2242 result = obj_request->result;
2243 if (result) {
2244 struct rbd_device *rbd_dev = img_request->rbd_dev;
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08002245 enum obj_operation_type op_type;
2246
Guangliang Zhao90e98c52014-04-01 22:22:16 +08002247 if (img_request_discard_test(img_request))
2248 op_type = OBJ_OP_DISCARD;
2249 else if (img_request_write_test(img_request))
2250 op_type = OBJ_OP_WRITE;
2251 else
2252 op_type = OBJ_OP_READ;
Alex Elder12178572013-02-08 09:55:49 -06002253
Ilya Dryomov9584d502014-07-11 12:11:20 +04002254 rbd_warn(rbd_dev, "%s %llx at %llx (%llx)",
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08002255 obj_op_name(op_type), obj_request->length,
2256 obj_request->img_offset, obj_request->offset);
Ilya Dryomov9584d502014-07-11 12:11:20 +04002257 rbd_warn(rbd_dev, " result %d xferred %x",
Alex Elder12178572013-02-08 09:55:49 -06002258 result, xferred);
2259 if (!img_request->result)
2260 img_request->result = result;
Ilya Dryomov082a75d2015-04-25 15:56:15 +03002261 /*
2262 * Need to end I/O on the entire obj_request worth of
2263 * bytes in case of error.
2264 */
2265 xferred = obj_request->length;
Alex Elder12178572013-02-08 09:55:49 -06002266 }
2267
Alex Elder8b3e1a52013-01-24 16:13:36 -06002268 if (img_request_child_test(img_request)) {
2269 rbd_assert(img_request->obj_request != NULL);
2270 more = obj_request->which < img_request->obj_request_count - 1;
2271 } else {
Christoph Hellwig2a842ac2017-06-03 09:38:04 +02002272 blk_status_t status = errno_to_blk_status(result);
2273
Alex Elder8b3e1a52013-01-24 16:13:36 -06002274 rbd_assert(img_request->rq != NULL);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01002275
Christoph Hellwig2a842ac2017-06-03 09:38:04 +02002276 more = blk_update_request(img_request->rq, status, xferred);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01002277 if (!more)
Christoph Hellwig2a842ac2017-06-03 09:38:04 +02002278 __blk_mq_end_request(img_request->rq, status);
Alex Elder8b3e1a52013-01-24 16:13:36 -06002279 }
2280
2281 return more;
Alex Elder12178572013-02-08 09:55:49 -06002282}
2283
Alex Elder21692382013-04-05 01:27:12 -05002284static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
2285{
2286 struct rbd_img_request *img_request;
2287 u32 which = obj_request->which;
2288 bool more = true;
2289
Alex Elder6365d332013-02-11 12:33:24 -06002290 rbd_assert(obj_request_img_data_test(obj_request));
Alex Elder21692382013-04-05 01:27:12 -05002291 img_request = obj_request->img_request;
2292
2293 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
2294 rbd_assert(img_request != NULL);
Alex Elder21692382013-04-05 01:27:12 -05002295 rbd_assert(img_request->obj_request_count > 0);
2296 rbd_assert(which != BAD_WHICH);
2297 rbd_assert(which < img_request->obj_request_count);
Alex Elder21692382013-04-05 01:27:12 -05002298
2299 spin_lock_irq(&img_request->completion_lock);
2300 if (which != img_request->next_completion)
2301 goto out;
2302
2303 for_each_obj_request_from(img_request, obj_request) {
Alex Elder21692382013-04-05 01:27:12 -05002304 rbd_assert(more);
2305 rbd_assert(which < img_request->obj_request_count);
2306
2307 if (!obj_request_done_test(obj_request))
2308 break;
Alex Elder12178572013-02-08 09:55:49 -06002309 more = rbd_img_obj_end_request(obj_request);
Alex Elder21692382013-04-05 01:27:12 -05002310 which++;
2311 }
2312
2313 rbd_assert(more ^ (which == img_request->obj_request_count));
2314 img_request->next_completion = which;
2315out:
2316 spin_unlock_irq(&img_request->completion_lock);
Alex Elder0f2d5be2014-04-26 14:21:44 +04002317 rbd_img_request_put(img_request);
Alex Elder21692382013-04-05 01:27:12 -05002318
2319 if (!more)
2320 rbd_img_request_complete(img_request);
2321}
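
/*
 * Hedged model, illustration only, of the in-order completion walk in
 * rbd_img_obj_callback() above: object requests may finish in any
 * order, but their results are handed back strictly by index, with
 * next_completion marking the first request not yet handed back.
 */
static u32 model_advance_completion(const bool *done, u32 count,
				    u32 next_completion)
{
	u32 which = next_completion;

	while (which < count && done[which])
		which++;	/* hand back every contiguously-done request */

	return which;		/* the new next_completion */
}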
2322
Alex Elderf1a47392013-04-19 15:34:50 -05002323/*
Josh Durgin3b434a2a2014-04-04 17:32:15 -07002324 * Add individual osd ops to the given ceph_osd_request and prepare
 2325 * them for submission. num_ops is the number of osd
 2326 * operations already added to the osd request.
2327 */
2328static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request,
2329 struct ceph_osd_request *osd_request,
2330 enum obj_operation_type op_type,
2331 unsigned int num_ops)
2332{
2333 struct rbd_img_request *img_request = obj_request->img_request;
2334 struct rbd_device *rbd_dev = img_request->rbd_dev;
2335 u64 object_size = rbd_obj_bytes(&rbd_dev->header);
2336 u64 offset = obj_request->offset;
2337 u64 length = obj_request->length;
2338 u64 img_end;
2339 u16 opcode;
2340
2341 if (op_type == OBJ_OP_DISCARD) {
Josh Durgind3246fb2014-04-07 16:49:21 -07002342 if (!offset && length == object_size &&
2343 (!img_request_layered_test(img_request) ||
2344 !obj_request_overlaps_parent(obj_request))) {
Josh Durgin3b434a2a2014-04-04 17:32:15 -07002345 opcode = CEPH_OSD_OP_DELETE;
2346 } else if ((offset + length == object_size)) {
2347 opcode = CEPH_OSD_OP_TRUNCATE;
2348 } else {
2349 down_read(&rbd_dev->header_rwsem);
2350 img_end = rbd_dev->header.image_size;
2351 up_read(&rbd_dev->header_rwsem);
2352
2353 if (obj_request->img_offset + length == img_end)
2354 opcode = CEPH_OSD_OP_TRUNCATE;
2355 else
2356 opcode = CEPH_OSD_OP_ZERO;
2357 }
2358 } else if (op_type == OBJ_OP_WRITE) {
Ilya Dryomove30b7572015-10-07 17:27:17 +02002359 if (!offset && length == object_size)
2360 opcode = CEPH_OSD_OP_WRITEFULL;
2361 else
2362 opcode = CEPH_OSD_OP_WRITE;
Josh Durgin3b434a2a2014-04-04 17:32:15 -07002363 osd_req_op_alloc_hint_init(osd_request, num_ops,
2364 object_size, object_size);
2365 num_ops++;
2366 } else {
2367 opcode = CEPH_OSD_OP_READ;
2368 }
2369
Ilya Dryomov7e868b62014-11-21 22:16:43 +03002370 if (opcode == CEPH_OSD_OP_DELETE)
Yan, Zheng144cba12015-04-27 11:09:54 +08002371 osd_req_op_init(osd_request, num_ops, opcode, 0);
Ilya Dryomov7e868b62014-11-21 22:16:43 +03002372 else
2373 osd_req_op_extent_init(osd_request, num_ops, opcode,
2374 offset, length, 0, 0);
2375
Josh Durgin3b434a2a2014-04-04 17:32:15 -07002376 if (obj_request->type == OBJ_REQUEST_BIO)
2377 osd_req_op_extent_osd_data_bio(osd_request, num_ops,
2378 obj_request->bio_list, length);
2379 else if (obj_request->type == OBJ_REQUEST_PAGES)
2380 osd_req_op_extent_osd_data_pages(osd_request, num_ops,
2381 obj_request->pages, length,
2382 offset & ~PAGE_MASK, false, false);
2383
2384 /* Discards are also writes */
2385 if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
2386 rbd_osd_req_format_write(obj_request);
2387 else
2388 rbd_osd_req_format_read(obj_request);
2389}
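
/*
 * Hedged example, illustration only: the discard opcode selection
 * above, condensed (the image-end TRUNCATE special case is omitted).
 * Discarding a whole object that needs no parent data deletes it, a
 * discard running to the end of the object truncates it, and an
 * interior discard zeroes a hole.
 */
static u16 model_discard_opcode(u64 offset, u64 length, u64 object_size,
				bool needs_parent)
{
	if (!offset && length == object_size && !needs_parent)
		return CEPH_OSD_OP_DELETE;
	if (offset + length == object_size)
		return CEPH_OSD_OP_TRUNCATE;

	return CEPH_OSD_OP_ZERO;
}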
2390
2391/*
Alex Elderf1a47392013-04-19 15:34:50 -05002392 * Split up an image request into one or more object requests, each
2393 * to a different object. The "type" parameter indicates whether
2394 * "data_desc" is the pointer to the head of a list of bio
2395 * structures, or the base of a page array. In either case this
2396 * function assumes data_desc describes memory sufficient to hold
2397 * all data described by the image request.
2398 */
2399static int rbd_img_request_fill(struct rbd_img_request *img_request,
2400 enum obj_request_type type,
2401 void *data_desc)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002402{
2403 struct rbd_device *rbd_dev = img_request->rbd_dev;
2404 struct rbd_obj_request *obj_request = NULL;
2405 struct rbd_obj_request *next_obj_request;
Jingoo Hana1580732013-08-09 13:04:35 +09002406 struct bio *bio_list = NULL;
Alex Elderf1a47392013-04-19 15:34:50 -05002407 unsigned int bio_offset = 0;
Jingoo Hana1580732013-08-09 13:04:35 +09002408 struct page **pages = NULL;
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08002409 enum obj_operation_type op_type;
Alex Elder7da22d22013-01-24 16:13:36 -06002410 u64 img_offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002411 u64 resid;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002412
Alex Elderf1a47392013-04-19 15:34:50 -05002413 dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2414 (int)type, data_desc);
Alex Elder37206ee2013-02-20 17:32:08 -06002415
Alex Elder7da22d22013-01-24 16:13:36 -06002416 img_offset = img_request->offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002417 resid = img_request->length;
Alex Elder4dda41d2013-02-20 21:59:33 -06002418 rbd_assert(resid > 0);
Josh Durgin3b434a2a2014-04-04 17:32:15 -07002419 op_type = rbd_img_request_op_type(img_request);
Alex Elderf1a47392013-04-19 15:34:50 -05002420
2421 if (type == OBJ_REQUEST_BIO) {
2422 bio_list = data_desc;
Kent Overstreet4f024f32013-10-11 15:44:27 -07002423 rbd_assert(img_offset ==
2424 bio_list->bi_iter.bi_sector << SECTOR_SHIFT);
Guangliang Zhao90e98c52014-04-01 22:22:16 +08002425 } else if (type == OBJ_REQUEST_PAGES) {
Alex Elderf1a47392013-04-19 15:34:50 -05002426 pages = data_desc;
2427 }
2428
Alex Elderbf0d5f502012-11-22 00:00:08 -06002429 while (resid) {
Alex Elder2fa12322013-04-05 01:27:12 -05002430 struct ceph_osd_request *osd_req;
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01002431 u64 object_no = img_offset >> rbd_dev->header.obj_order;
Ilya Dryomov67e2b652017-01-25 18:16:22 +01002432 u64 offset = rbd_segment_offset(rbd_dev, img_offset);
2433 u64 length = rbd_segment_length(rbd_dev, img_offset, resid);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002434
Ilya Dryomov6c696d82017-01-25 18:16:23 +01002435 obj_request = rbd_obj_request_create(type);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002436 if (!obj_request)
2437 goto out_unwind;
Ilya Dryomov62054da2014-03-04 11:57:17 +02002438
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01002439 obj_request->object_no = object_no;
Ilya Dryomov67e2b652017-01-25 18:16:22 +01002440 obj_request->offset = offset;
2441 obj_request->length = length;
2442
Josh Durgin03507db2013-08-27 14:45:46 -07002443 /*
2444 * set obj_request->img_request before creating the
2445 * osd_request so that it gets the right snapc
2446 */
2447 rbd_img_obj_request_add(img_request, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002448
Alex Elderf1a47392013-04-19 15:34:50 -05002449 if (type == OBJ_REQUEST_BIO) {
2450 unsigned int clone_size;
2451
2452 rbd_assert(length <= (u64)UINT_MAX);
2453 clone_size = (unsigned int)length;
2454 obj_request->bio_list =
2455 bio_chain_clone_range(&bio_list,
2456 &bio_offset,
2457 clone_size,
David Disseldorp2224d872016-04-05 11:13:39 +02002458 GFP_NOIO);
Alex Elderf1a47392013-04-19 15:34:50 -05002459 if (!obj_request->bio_list)
Ilya Dryomov62054da2014-03-04 11:57:17 +02002460 goto out_unwind;
Guangliang Zhao90e98c52014-04-01 22:22:16 +08002461 } else if (type == OBJ_REQUEST_PAGES) {
Alex Elderf1a47392013-04-19 15:34:50 -05002462 unsigned int page_count;
2463
2464 obj_request->pages = pages;
2465 page_count = (u32)calc_pages_for(offset, length);
2466 obj_request->page_count = page_count;
2467 if ((offset + length) & ~PAGE_MASK)
2468 page_count--; /* more on last page */
2469 pages += page_count;
2470 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06002471
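		/*
		 * Writes get a second op slot; rbd_img_obj_request_fill()
		 * fills it with the allocation hint op that precedes the
		 * write op itself.
		 */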
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08002472 osd_req = rbd_osd_req_create(rbd_dev, op_type,
2473 (op_type == OBJ_OP_WRITE) ? 2 : 1,
2474 obj_request);
Alex Elder2fa12322013-04-05 01:27:12 -05002475 if (!osd_req)
Ilya Dryomov62054da2014-03-04 11:57:17 +02002476 goto out_unwind;
Josh Durgin3b434a2a2014-04-04 17:32:15 -07002477
Alex Elder2fa12322013-04-05 01:27:12 -05002478 obj_request->osd_req = osd_req;
Alex Elder21692382013-04-05 01:27:12 -05002479 obj_request->callback = rbd_img_obj_callback;
Alex Elder7da22d22013-01-24 16:13:36 -06002480 obj_request->img_offset = img_offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002481
Josh Durgin3b434a2a2014-04-04 17:32:15 -07002482 rbd_img_obj_request_fill(obj_request, osd_req, op_type, 0);
2483
Alex Elder7da22d22013-01-24 16:13:36 -06002484 img_offset += length;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002485 resid -= length;
2486 }
2487
2488 return 0;
2489
Alex Elderbf0d5f502012-11-22 00:00:08 -06002490out_unwind:
2491 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
Ilya Dryomov42dd0372014-03-04 11:57:17 +02002492 rbd_img_obj_request_del(img_request, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002493
2494 return -ENOMEM;
2495}
2496
Alex Elder3d7efd12013-04-19 15:34:50 -05002497static void
Ilya Dryomov27617132015-07-16 17:36:11 +03002498rbd_osd_copyup_callback(struct rbd_obj_request *obj_request)
Alex Elder0eefd472013-04-19 15:34:50 -05002499{
2500 struct rbd_img_request *img_request;
2501 struct rbd_device *rbd_dev;
Alex Elderebda6402013-05-10 16:29:22 -05002502 struct page **pages;
Alex Elder0eefd472013-04-19 15:34:50 -05002503 u32 page_count;
2504
Ilya Dryomov27617132015-07-16 17:36:11 +03002505 dout("%s: obj %p\n", __func__, obj_request);
2506
Josh Durgind3246fb2014-04-07 16:49:21 -07002507 rbd_assert(obj_request->type == OBJ_REQUEST_BIO ||
2508 obj_request->type == OBJ_REQUEST_NODATA);
Alex Elder0eefd472013-04-19 15:34:50 -05002509 rbd_assert(obj_request_img_data_test(obj_request));
2510 img_request = obj_request->img_request;
2511 rbd_assert(img_request);
2512
2513 rbd_dev = img_request->rbd_dev;
2514 rbd_assert(rbd_dev);
Alex Elder0eefd472013-04-19 15:34:50 -05002515
Alex Elderebda6402013-05-10 16:29:22 -05002516 pages = obj_request->copyup_pages;
2517 rbd_assert(pages != NULL);
Alex Elder0eefd472013-04-19 15:34:50 -05002518 obj_request->copyup_pages = NULL;
Alex Elderebda6402013-05-10 16:29:22 -05002519 page_count = obj_request->copyup_page_count;
2520 rbd_assert(page_count);
2521 obj_request->copyup_page_count = 0;
2522 ceph_release_page_vector(pages, page_count);
Alex Elder0eefd472013-04-19 15:34:50 -05002523
2524 /*
2525 * We want the transfer count to reflect the size of the
2526 * original write request. There is no such thing as a
2527 * successful short write, so if the request was successful
2528 * we can just set it to the originally-requested length.
2529 */
2530 if (!obj_request->result)
2531 obj_request->xferred = obj_request->length;
2532
Ilya Dryomov27617132015-07-16 17:36:11 +03002533 obj_request_done_set(obj_request);
Alex Elder0eefd472013-04-19 15:34:50 -05002534}
2535
2536static void
Alex Elder3d7efd12013-04-19 15:34:50 -05002537rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
2538{
2539 struct rbd_obj_request *orig_request;
Alex Elder0eefd472013-04-19 15:34:50 -05002540 struct ceph_osd_request *osd_req;
Alex Elder0eefd472013-04-19 15:34:50 -05002541 struct rbd_device *rbd_dev;
Alex Elder3d7efd12013-04-19 15:34:50 -05002542 struct page **pages;
Josh Durgind3246fb2014-04-07 16:49:21 -07002543 enum obj_operation_type op_type;
Alex Elderebda6402013-05-10 16:29:22 -05002544 u32 page_count;
Alex Elderbbea1c12013-05-06 17:40:33 -05002545 int img_result;
Alex Elderebda6402013-05-10 16:29:22 -05002546 u64 parent_length;
Alex Elder3d7efd12013-04-19 15:34:50 -05002547
2548 rbd_assert(img_request_child_test(img_request));
2549
2550 /* First get what we need from the image request */
2551
2552 pages = img_request->copyup_pages;
2553 rbd_assert(pages != NULL);
2554 img_request->copyup_pages = NULL;
Alex Elderebda6402013-05-10 16:29:22 -05002555 page_count = img_request->copyup_page_count;
2556 rbd_assert(page_count);
2557 img_request->copyup_page_count = 0;
Alex Elder3d7efd12013-04-19 15:34:50 -05002558
2559 orig_request = img_request->obj_request;
2560 rbd_assert(orig_request != NULL);
Alex Elderb91f09f2013-05-10 16:29:22 -05002561 rbd_assert(obj_request_type_valid(orig_request->type));
Alex Elderbbea1c12013-05-06 17:40:33 -05002562 img_result = img_request->result;
Alex Elderebda6402013-05-10 16:29:22 -05002563 parent_length = img_request->length;
Ilya Dryomovfa355112016-09-16 15:20:42 +02002564 rbd_assert(img_result || parent_length == img_request->xferred);
Alex Elder3d7efd12013-04-19 15:34:50 -05002565 rbd_img_request_put(img_request);
2566
Alex Elder91c6feb2013-05-06 17:40:32 -05002567 rbd_assert(orig_request->img_request);
2568 rbd_dev = orig_request->img_request->rbd_dev;
Alex Elder3d7efd12013-04-19 15:34:50 -05002569 rbd_assert(rbd_dev);
Alex Elder3d7efd12013-04-19 15:34:50 -05002570
Alex Elderbbea1c12013-05-06 17:40:33 -05002571 /*
2572 * If the overlap has become 0 (most likely because the
2573 * image has been flattened) we need to free the pages
2574 * and re-submit the original write request.
2575 */
2576 if (!rbd_dev->parent_overlap) {
Alex Elderbbea1c12013-05-06 17:40:33 -05002577 ceph_release_page_vector(pages, page_count);
Ilya Dryomov980917f2016-09-12 18:59:42 +02002578 rbd_obj_request_submit(orig_request);
2579 return;
Alex Elderbbea1c12013-05-06 17:40:33 -05002580 }
2581
2582 if (img_result)
Alex Elder0eefd472013-04-19 15:34:50 -05002583 goto out_err;
Alex Elder3d7efd12013-04-19 15:34:50 -05002584
Alex Elder8785b1d2013-05-09 10:08:49 -05002585 /*
 2586 * The original osd request is of no use to us any more.
Ilya Dryomov0ccd5922014-02-25 16:22:28 +02002587 * We need a new one that can hold the three ops in a copyup
Alex Elder8785b1d2013-05-09 10:08:49 -05002588 * request. Allocate the new copyup osd request for the
2589 * original request, and release the old one.
2590 */
Alex Elderbbea1c12013-05-06 17:40:33 -05002591 img_result = -ENOMEM;
Alex Elder0eefd472013-04-19 15:34:50 -05002592 osd_req = rbd_osd_req_create_copyup(orig_request);
2593 if (!osd_req)
2594 goto out_err;
Alex Elder8785b1d2013-05-09 10:08:49 -05002595 rbd_osd_req_destroy(orig_request->osd_req);
Alex Elder0eefd472013-04-19 15:34:50 -05002596 orig_request->osd_req = osd_req;
2597 orig_request->copyup_pages = pages;
Alex Elderebda6402013-05-10 16:29:22 -05002598 orig_request->copyup_page_count = page_count;
Alex Elder3d7efd12013-04-19 15:34:50 -05002599
Alex Elder0eefd472013-04-19 15:34:50 -05002600 /* Initialize the copyup op */
2601
2602 osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
Alex Elderebda6402013-05-10 16:29:22 -05002603 osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
Alex Elder0eefd472013-04-19 15:34:50 -05002604 false, false);
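	/*
	 * The copyup op writes the data read from the parent into the
	 * target object; the original write op(s) added below are then
	 * applied on top within the same OSD request.
	 */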
2605
Josh Durgind3246fb2014-04-07 16:49:21 -07002606 /* Add the other op(s) */
Ilya Dryomov0ccd5922014-02-25 16:22:28 +02002607
Josh Durgind3246fb2014-04-07 16:49:21 -07002608 op_type = rbd_img_request_op_type(orig_request->img_request);
2609 rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1);
Alex Elder0eefd472013-04-19 15:34:50 -05002610
2611 /* All set, send it off. */
2612
Ilya Dryomov980917f2016-09-12 18:59:42 +02002613 rbd_obj_request_submit(orig_request);
2614 return;
Alex Elder0eefd472013-04-19 15:34:50 -05002615
Alex Elder0eefd472013-04-19 15:34:50 -05002616out_err:
Ilya Dryomovfa355112016-09-16 15:20:42 +02002617 ceph_release_page_vector(pages, page_count);
Ilya Dryomov0dcc6852016-09-26 15:43:52 +02002618 rbd_obj_request_error(orig_request, img_result);
Alex Elder3d7efd12013-04-19 15:34:50 -05002619}
2620
2621/*
2622 * Read from the parent image the range of data that covers the
2623 * entire target of the given object request. This is used for
2624 * satisfying a layered image write request when the target of an
2625 * object request from the image request does not exist.
2626 *
2627 * A page array big enough to hold the returned data is allocated
2628 * and supplied to rbd_img_request_fill() as the "data descriptor."
2629 * When the read completes, this page array will be transferred to
2630 * the original object request for the copyup operation.
2631 *
Ilya Dryomovc2e82412016-09-13 20:18:01 +02002632 * If an error occurs, it is recorded as the result of the original
2633 * object request in rbd_img_obj_exists_callback().
Alex Elder3d7efd12013-04-19 15:34:50 -05002634 */
2635static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
2636{
Ilya Dryomov058aa992016-09-12 14:44:45 +02002637 struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
Alex Elder3d7efd12013-04-19 15:34:50 -05002638 struct rbd_img_request *parent_request = NULL;
Alex Elder3d7efd12013-04-19 15:34:50 -05002639 u64 img_offset;
2640 u64 length;
2641 struct page **pages = NULL;
2642 u32 page_count;
2643 int result;
2644
Alex Elder3d7efd12013-04-19 15:34:50 -05002645 rbd_assert(rbd_dev->parent != NULL);
2646
2647 /*
2648 * Determine the byte range covered by the object in the
2649 * child image to which the original request was to be sent.
2650 */
2651 img_offset = obj_request->img_offset - obj_request->offset;
Ilya Dryomov5bc3fb12017-01-25 18:16:22 +01002652 length = rbd_obj_bytes(&rbd_dev->header);
Alex Elder3d7efd12013-04-19 15:34:50 -05002653
2654 /*
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002655 * There is no defined parent data beyond the parent
2656 * overlap, so limit what we read at that boundary if
2657 * necessary.
2658 */
2659 if (img_offset + length > rbd_dev->parent_overlap) {
2660 rbd_assert(img_offset < rbd_dev->parent_overlap);
2661 length = rbd_dev->parent_overlap - img_offset;
2662 }
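	/*
	 * E.g. (hypothetical numbers): with 4 MiB objects and a 6 MiB
	 * parent_overlap, the object covering [4 MiB, 8 MiB) reads only
	 * 2 MiB of parent data.
	 */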
2663
2664 /*
Alex Elder3d7efd12013-04-19 15:34:50 -05002665 * Allocate a page array big enough to receive the data read
2666 * from the parent.
2667 */
2668 page_count = (u32)calc_pages_for(0, length);
Ilya Dryomov1e37f2f2017-11-06 11:33:36 +01002669 pages = ceph_alloc_page_vector(page_count, GFP_NOIO);
Alex Elder3d7efd12013-04-19 15:34:50 -05002670 if (IS_ERR(pages)) {
2671 result = PTR_ERR(pages);
2672 pages = NULL;
2673 goto out_err;
2674 }
2675
2676 result = -ENOMEM;
Alex Eldere93f3152013-05-08 22:50:04 -05002677 parent_request = rbd_parent_request_create(obj_request,
2678 img_offset, length);
Alex Elder3d7efd12013-04-19 15:34:50 -05002679 if (!parent_request)
2680 goto out_err;
Alex Elder3d7efd12013-04-19 15:34:50 -05002681
2682 result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
2683 if (result)
2684 goto out_err;
Ilya Dryomov058aa992016-09-12 14:44:45 +02002685
Alex Elder3d7efd12013-04-19 15:34:50 -05002686 parent_request->copyup_pages = pages;
Alex Elderebda6402013-05-10 16:29:22 -05002687 parent_request->copyup_page_count = page_count;
Alex Elder3d7efd12013-04-19 15:34:50 -05002688 parent_request->callback = rbd_img_obj_parent_read_full_callback;
Ilya Dryomov058aa992016-09-12 14:44:45 +02002689
Alex Elder3d7efd12013-04-19 15:34:50 -05002690 result = rbd_img_request_submit(parent_request);
2691 if (!result)
2692 return 0;
2693
2694 parent_request->copyup_pages = NULL;
Alex Elderebda6402013-05-10 16:29:22 -05002695 parent_request->copyup_page_count = 0;
Alex Elder3d7efd12013-04-19 15:34:50 -05002696 parent_request->obj_request = NULL;
2697 rbd_obj_request_put(obj_request);
2698out_err:
2699 if (pages)
2700 ceph_release_page_vector(pages, page_count);
2701 if (parent_request)
2702 rbd_img_request_put(parent_request);
Alex Elder3d7efd12013-04-19 15:34:50 -05002703 return result;
2704}
2705
Alex Elderc5b5ef62013-02-11 12:33:24 -06002706static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2707{
Alex Elderc5b5ef62013-02-11 12:33:24 -06002708 struct rbd_obj_request *orig_request;
Alex Elder638f5ab2013-05-06 17:40:33 -05002709 struct rbd_device *rbd_dev;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002710 int result;
2711
2712 rbd_assert(!obj_request_img_data_test(obj_request));
2713
2714 /*
2715 * All we need from the object request is the original
2716 * request and the result of the STAT op. Grab those, then
2717 * we're done with the request.
2718 */
2719 orig_request = obj_request->obj_request;
2720 obj_request->obj_request = NULL;
Alex Elder912c3172013-05-13 20:35:38 -05002721 rbd_obj_request_put(orig_request);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002722 rbd_assert(orig_request);
2723 rbd_assert(orig_request->img_request);
2724
2725 result = obj_request->result;
2726 obj_request->result = 0;
2727
2728 dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2729 obj_request, orig_request, result,
2730 obj_request->xferred, obj_request->length);
2731 rbd_obj_request_put(obj_request);
2732
Alex Elder638f5ab2013-05-06 17:40:33 -05002733 /*
2734 * If the overlap has become 0 (most likely because the
Ilya Dryomov980917f2016-09-12 18:59:42 +02002735 * image has been flattened) we need to re-submit the
2736 * original request.
Alex Elder638f5ab2013-05-06 17:40:33 -05002737 */
2738 rbd_dev = orig_request->img_request->rbd_dev;
2739 if (!rbd_dev->parent_overlap) {
Ilya Dryomov980917f2016-09-12 18:59:42 +02002740 rbd_obj_request_submit(orig_request);
2741 return;
Alex Elder638f5ab2013-05-06 17:40:33 -05002742 }
Alex Elderc5b5ef62013-02-11 12:33:24 -06002743
2744 /*
2745 * Our only purpose here is to determine whether the object
2746 * exists, and we don't want to treat the non-existence as
2747 * an error. If something else comes back, transfer the
2748 * error to the original request and complete it now.
2749 */
2750 if (!result) {
2751 obj_request_existence_set(orig_request, true);
2752 } else if (result == -ENOENT) {
2753 obj_request_existence_set(orig_request, false);
Ilya Dryomovc2e82412016-09-13 20:18:01 +02002754 } else {
2755 goto fail_orig_request;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002756 }
2757
2758 /*
2759 * Resubmit the original request now that we have recorded
2760 * whether the target object exists.
2761 */
Ilya Dryomovc2e82412016-09-13 20:18:01 +02002762 result = rbd_img_obj_request_submit(orig_request);
2763 if (result)
2764 goto fail_orig_request;
2765
2766 return;
2767
2768fail_orig_request:
Ilya Dryomov0dcc6852016-09-26 15:43:52 +02002769 rbd_obj_request_error(orig_request, result);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002770}
2771
2772static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2773{
Ilya Dryomov058aa992016-09-12 14:44:45 +02002774 struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002775 struct rbd_obj_request *stat_request;
Ilya Dryomov710214e2016-09-15 17:53:32 +02002776 struct page **pages;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002777 u32 page_count;
2778 size_t size;
2779 int ret;
2780
Ilya Dryomov6c696d82017-01-25 18:16:23 +01002781 stat_request = rbd_obj_request_create(OBJ_REQUEST_PAGES);
Ilya Dryomov710214e2016-09-15 17:53:32 +02002782 if (!stat_request)
2783 return -ENOMEM;
2784
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01002785 stat_request->object_no = obj_request->object_no;
2786
Ilya Dryomov710214e2016-09-15 17:53:32 +02002787 stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
2788 stat_request);
2789 if (!stat_request->osd_req) {
2790 ret = -ENOMEM;
2791 goto fail_stat_request;
2792 }
2793
Alex Elderc5b5ef62013-02-11 12:33:24 -06002794 /*
2795 * The response data for a STAT call consists of:
2796 * le64 length;
2797 * struct {
2798 * le32 tv_sec;
2799 * le32 tv_nsec;
2800 * } mtime;
2801 */
2802 size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2803 page_count = (u32)calc_pages_for(0, size);
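	/* size works out to 16 bytes, so a single page always suffices */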
Ilya Dryomov1e37f2f2017-11-06 11:33:36 +01002804 pages = ceph_alloc_page_vector(page_count, GFP_NOIO);
Ilya Dryomov710214e2016-09-15 17:53:32 +02002805 if (IS_ERR(pages)) {
2806 ret = PTR_ERR(pages);
2807 goto fail_stat_request;
2808 }
Alex Elderc5b5ef62013-02-11 12:33:24 -06002809
Ilya Dryomov710214e2016-09-15 17:53:32 +02002810 osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0);
2811 osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2812 false, false);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002813
2814 rbd_obj_request_get(obj_request);
2815 stat_request->obj_request = obj_request;
2816 stat_request->pages = pages;
2817 stat_request->page_count = page_count;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002818 stat_request->callback = rbd_img_obj_exists_callback;
2819
Ilya Dryomov980917f2016-09-12 18:59:42 +02002820 rbd_obj_request_submit(stat_request);
2821 return 0;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002822
Ilya Dryomov710214e2016-09-15 17:53:32 +02002823fail_stat_request:
2824 rbd_obj_request_put(stat_request);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002825 return ret;
2826}
2827
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002828static bool img_obj_request_simple(struct rbd_obj_request *obj_request)
Alex Elderb454e362013-04-19 15:34:50 -05002829{
Ilya Dryomov058aa992016-09-12 14:44:45 +02002830 struct rbd_img_request *img_request = obj_request->img_request;
2831 struct rbd_device *rbd_dev = img_request->rbd_dev;
Alex Elderb454e362013-04-19 15:34:50 -05002832
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002833 /* Reads */
Josh Durgin1c220882014-04-04 17:49:12 -07002834 if (!img_request_write_test(img_request) &&
2835 !img_request_discard_test(img_request))
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002836 return true;
Alex Elderb454e362013-04-19 15:34:50 -05002837
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002838 /* Non-layered writes */
2839 if (!img_request_layered_test(img_request))
2840 return true;
2841
2842 /*
2843 * Layered writes outside of the parent overlap range don't
2844 * share any data with the parent.
2845 */
2846 if (!obj_request_overlaps_parent(obj_request))
2847 return true;
2848
2849 /*
Guangliang Zhaoc622d222014-04-01 22:22:15 +08002850 * Entire-object layered writes - we will overwrite whatever
2851 * parent data there is anyway.
2852 */
2853 if (!obj_request->offset &&
2854 obj_request->length == rbd_obj_bytes(&rbd_dev->header))
2855 return true;
2856
2857 /*
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002858 * If the object is known to already exist, its parent data has
2859 * already been copied.
2860 */
2861 if (obj_request_known_test(obj_request) &&
2862 obj_request_exists_test(obj_request))
2863 return true;
2864
2865 return false;
2866}
2867
2868static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2869{
Ilya Dryomov058aa992016-09-12 14:44:45 +02002870 rbd_assert(obj_request_img_data_test(obj_request));
2871 rbd_assert(obj_request_type_valid(obj_request->type));
2872 rbd_assert(obj_request->img_request);
2873
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002874 if (img_obj_request_simple(obj_request)) {
Ilya Dryomov980917f2016-09-12 18:59:42 +02002875 rbd_obj_request_submit(obj_request);
2876 return 0;
Alex Elderb454e362013-04-19 15:34:50 -05002877 }
2878
2879 /*
Alex Elder3d7efd12013-04-19 15:34:50 -05002880 * It's a layered write. The target object might exist but
2881 * we may not know that yet. If we know it doesn't exist,
2882 * start by reading the data for the full target object from
2883 * the parent so we can use it for a copyup to the target.
Alex Elderb454e362013-04-19 15:34:50 -05002884 */
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002885 if (obj_request_known_test(obj_request))
Alex Elder3d7efd12013-04-19 15:34:50 -05002886 return rbd_img_obj_parent_read_full(obj_request);
2887
2888 /* We don't know whether the target exists. Go find out. */
Alex Elderb454e362013-04-19 15:34:50 -05002889
2890 return rbd_img_obj_exists_submit(obj_request);
2891}
2892
Alex Elderbf0d5f502012-11-22 00:00:08 -06002893static int rbd_img_request_submit(struct rbd_img_request *img_request)
2894{
Alex Elderbf0d5f502012-11-22 00:00:08 -06002895 struct rbd_obj_request *obj_request;
Alex Elder46faeed2013-04-10 17:47:46 -05002896 struct rbd_obj_request *next_obj_request;
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002897 int ret = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002898
Alex Elder37206ee2013-02-20 17:32:08 -06002899 dout("%s: img %p\n", __func__, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002900
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002901 rbd_img_request_get(img_request);
2902 for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
Alex Elderb454e362013-04-19 15:34:50 -05002903 ret = rbd_img_obj_request_submit(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002904 if (ret)
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002905 goto out_put_ireq;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002906 }
2907
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002908out_put_ireq:
2909 rbd_img_request_put(img_request);
2910 return ret;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002911}
2912
Alex Elder8b3e1a52013-01-24 16:13:36 -06002913static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
2914{
2915 struct rbd_obj_request *obj_request;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002916 struct rbd_device *rbd_dev;
2917 u64 obj_end;
Alex Elder02c74fb2013-05-06 17:40:33 -05002918 u64 img_xferred;
2919 int img_result;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002920
2921 rbd_assert(img_request_child_test(img_request));
2922
Alex Elder02c74fb2013-05-06 17:40:33 -05002923 /* First get what we need from the image request and release it */
2924
Alex Elder8b3e1a52013-01-24 16:13:36 -06002925 obj_request = img_request->obj_request;
Alex Elder02c74fb2013-05-06 17:40:33 -05002926 img_xferred = img_request->xferred;
2927 img_result = img_request->result;
2928 rbd_img_request_put(img_request);
2929
2930 /*
2931 * If the overlap has become 0 (most likely because the
2932 * image has been flattened) we need to re-submit the
2933 * original request.
2934 */
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002935 rbd_assert(obj_request);
2936 rbd_assert(obj_request->img_request);
Alex Elder02c74fb2013-05-06 17:40:33 -05002937 rbd_dev = obj_request->img_request->rbd_dev;
2938 if (!rbd_dev->parent_overlap) {
Ilya Dryomov980917f2016-09-12 18:59:42 +02002939 rbd_obj_request_submit(obj_request);
2940 return;
Alex Elder02c74fb2013-05-06 17:40:33 -05002941 }
2942
2943 obj_request->result = img_result;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002944 if (obj_request->result)
2945 goto out;
2946
2947 /*
2948 * We need to zero anything beyond the parent overlap
2949 * boundary. Since rbd_img_obj_request_read_callback()
2950 * will zero anything beyond the end of a short read, an
2951 * easy way to do this is to pretend the data from the
2952 * parent came up short--ending at the overlap boundary.
2953 */
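	/*
	 * E.g. (hypothetical): with a 6 MiB overlap, a child read
	 * covering [5 MiB, 7 MiB) has xferred capped at 1 MiB; the
	 * remainder is zero-filled by the read callback.
	 */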
2954 rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2955 obj_end = obj_request->img_offset + obj_request->length;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002956 if (obj_end > rbd_dev->parent_overlap) {
2957 u64 xferred = 0;
2958
2959 if (obj_request->img_offset < rbd_dev->parent_overlap)
2960 xferred = rbd_dev->parent_overlap -
2961 obj_request->img_offset;
2962
Alex Elder02c74fb2013-05-06 17:40:33 -05002963 obj_request->xferred = min(img_xferred, xferred);
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002964 } else {
Alex Elder02c74fb2013-05-06 17:40:33 -05002965 obj_request->xferred = img_xferred;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002966 }
2967out:
Alex Elder8b3e1a52013-01-24 16:13:36 -06002968 rbd_img_obj_request_read_callback(obj_request);
2969 rbd_obj_request_complete(obj_request);
2970}
2971
2972static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
2973{
Alex Elder8b3e1a52013-01-24 16:13:36 -06002974 struct rbd_img_request *img_request;
2975 int result;
2976
2977 rbd_assert(obj_request_img_data_test(obj_request));
2978 rbd_assert(obj_request->img_request != NULL);
2979 rbd_assert(obj_request->result == (s32) -ENOENT);
Alex Elder5b2ab722013-05-06 17:40:33 -05002980 rbd_assert(obj_request_type_valid(obj_request->type));
Alex Elder8b3e1a52013-01-24 16:13:36 -06002981
Alex Elder8b3e1a52013-01-24 16:13:36 -06002982 /* rbd_read_finish(obj_request, obj_request->length); */
Alex Eldere93f3152013-05-08 22:50:04 -05002983 img_request = rbd_parent_request_create(obj_request,
Alex Elder8b3e1a52013-01-24 16:13:36 -06002984 obj_request->img_offset,
Alex Eldere93f3152013-05-08 22:50:04 -05002985 obj_request->length);
Alex Elder8b3e1a52013-01-24 16:13:36 -06002986 result = -ENOMEM;
2987 if (!img_request)
2988 goto out_err;
2989
Alex Elder5b2ab722013-05-06 17:40:33 -05002990 if (obj_request->type == OBJ_REQUEST_BIO)
2991 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2992 obj_request->bio_list);
2993 else
2994 result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES,
2995 obj_request->pages);
Alex Elder8b3e1a52013-01-24 16:13:36 -06002996 if (result)
2997 goto out_err;
2998
2999 img_request->callback = rbd_img_parent_read_callback;
3000 result = rbd_img_request_submit(img_request);
3001 if (result)
3002 goto out_err;
3003
3004 return;
3005out_err:
3006 if (img_request)
3007 rbd_img_request_put(img_request);
3008 obj_request->result = result;
3009 obj_request->xferred = 0;
3010 obj_request_done_set(obj_request);
3011}
3012
Ilya Dryomoved95b212016-08-12 16:40:02 +02003013static const struct rbd_client_id rbd_empty_cid;
3014
3015static bool rbd_cid_equal(const struct rbd_client_id *lhs,
3016 const struct rbd_client_id *rhs)
3017{
3018 return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
3019}
3020
3021static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
3022{
3023 struct rbd_client_id cid;
3024
3025 mutex_lock(&rbd_dev->watch_mutex);
3026 cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
3027 cid.handle = rbd_dev->watch_cookie;
3028 mutex_unlock(&rbd_dev->watch_mutex);
3029 return cid;
3030}
3031
3032/*
3033 * lock_rwsem must be held for write
3034 */
3035static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
3036 const struct rbd_client_id *cid)
3037{
3038 dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
3039 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
3040 cid->gid, cid->handle);
3041 rbd_dev->owner_cid = *cid; /* struct */
3042}
3043
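/*
 * The lock cookie is "<RBD_LOCK_COOKIE_PREFIX> <watch_cookie>", tying
 * the exclusive lock to the current watch session.
 */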
3044static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
3045{
3046 mutex_lock(&rbd_dev->watch_mutex);
3047 sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
3048 mutex_unlock(&rbd_dev->watch_mutex);
3049}
3050
3051/*
3052 * lock_rwsem must be held for write
3053 */
3054static int rbd_lock(struct rbd_device *rbd_dev)
3055{
3056 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3057 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3058 char cookie[32];
3059 int ret;
3060
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02003061 WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
3062 rbd_dev->lock_cookie[0] != '\0');
Ilya Dryomoved95b212016-08-12 16:40:02 +02003063
3064 format_lock_cookie(rbd_dev, cookie);
3065 ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3066 RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
3067 RBD_LOCK_TAG, "", 0);
3068 if (ret)
3069 return ret;
3070
3071 rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02003072 strcpy(rbd_dev->lock_cookie, cookie);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003073 rbd_set_owner_cid(rbd_dev, &cid);
3074 queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
3075 return 0;
3076}
3077
3078/*
3079 * lock_rwsem must be held for write
3080 */
Ilya Dryomovbbead742017-04-13 12:17:38 +02003081static void rbd_unlock(struct rbd_device *rbd_dev)
Ilya Dryomoved95b212016-08-12 16:40:02 +02003082{
3083 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003084 int ret;
3085
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02003086 WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
3087 rbd_dev->lock_cookie[0] == '\0');
Ilya Dryomoved95b212016-08-12 16:40:02 +02003088
Ilya Dryomoved95b212016-08-12 16:40:02 +02003089 ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02003090 RBD_LOCK_NAME, rbd_dev->lock_cookie);
Ilya Dryomovbbead742017-04-13 12:17:38 +02003091 if (ret && ret != -ENOENT)
3092 rbd_warn(rbd_dev, "failed to unlock: %d", ret);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003093
Ilya Dryomovbbead742017-04-13 12:17:38 +02003094 /* treat errors as the image is unlocked */
3095 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02003096 rbd_dev->lock_cookie[0] = '\0';
Ilya Dryomoved95b212016-08-12 16:40:02 +02003097 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3098 queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003099}
3100
3101static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
3102 enum rbd_notify_op notify_op,
3103 struct page ***preply_pages,
3104 size_t *preply_len)
3105{
3106 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3107 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3108 int buf_size = 4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN;
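	/* notify_op (4) + gid (8) + handle (8), plus the encoding preamble */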
3109 char buf[buf_size];
3110 void *p = buf;
3111
3112 dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
3113
3114 /* encode *LockPayload NotifyMessage (op + ClientId) */
3115 ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
3116 ceph_encode_32(&p, notify_op);
3117 ceph_encode_64(&p, cid.gid);
3118 ceph_encode_64(&p, cid.handle);
3119
3120 return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
3121 &rbd_dev->header_oloc, buf, buf_size,
3122 RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
3123}
3124
3125static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
3126 enum rbd_notify_op notify_op)
3127{
3128 struct page **reply_pages;
3129 size_t reply_len;
3130
3131 __rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
3132 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3133}
3134
3135static void rbd_notify_acquired_lock(struct work_struct *work)
3136{
3137 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3138 acquired_lock_work);
3139
3140 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
3141}
3142
3143static void rbd_notify_released_lock(struct work_struct *work)
3144{
3145 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3146 released_lock_work);
3147
3148 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
3149}
3150
3151static int rbd_request_lock(struct rbd_device *rbd_dev)
3152{
3153 struct page **reply_pages;
3154 size_t reply_len;
3155 bool lock_owner_responded = false;
3156 int ret;
3157
3158 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3159
3160 ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
3161 &reply_pages, &reply_len);
3162 if (ret && ret != -ETIMEDOUT) {
3163 rbd_warn(rbd_dev, "failed to request lock: %d", ret);
3164 goto out;
3165 }
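	/*
	 * The notify reply is a list of acks: a u32 count, then per
	 * responder a gid (u64), a cookie (u64) and an optional payload.
	 * Only the lock owner replies with a ResponseMessage payload;
	 * everyone else acks with an empty one.
	 */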
3166
3167 if (reply_len > 0 && reply_len <= PAGE_SIZE) {
3168 void *p = page_address(reply_pages[0]);
3169 void *const end = p + reply_len;
3170 u32 n;
3171
3172 ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
3173 while (n--) {
3174 u8 struct_v;
3175 u32 len;
3176
3177 ceph_decode_need(&p, end, 8 + 8, e_inval);
3178 p += 8 + 8; /* skip gid and cookie */
3179
3180 ceph_decode_32_safe(&p, end, len, e_inval);
3181 if (!len)
3182 continue;
3183
3184 if (lock_owner_responded) {
3185 rbd_warn(rbd_dev,
3186 "duplicate lock owners detected");
3187 ret = -EIO;
3188 goto out;
3189 }
3190
3191 lock_owner_responded = true;
3192 ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
3193 &struct_v, &len);
3194 if (ret) {
3195 rbd_warn(rbd_dev,
3196 "failed to decode ResponseMessage: %d",
3197 ret);
3198 goto e_inval;
3199 }
3200
3201 ret = ceph_decode_32(&p);
3202 }
3203 }
3204
3205 if (!lock_owner_responded) {
3206 rbd_warn(rbd_dev, "no lock owners detected");
3207 ret = -ETIMEDOUT;
3208 }
3209
3210out:
3211 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3212 return ret;
3213
3214e_inval:
3215 ret = -EINVAL;
3216 goto out;
3217}
3218
3219static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
3220{
3221 dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);
3222
3223 cancel_delayed_work(&rbd_dev->lock_dwork);
3224 if (wake_all)
3225 wake_up_all(&rbd_dev->lock_waitq);
3226 else
3227 wake_up(&rbd_dev->lock_waitq);
3228}
3229
3230static int get_lock_owner_info(struct rbd_device *rbd_dev,
3231 struct ceph_locker **lockers, u32 *num_lockers)
3232{
3233 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3234 u8 lock_type;
3235 char *lock_tag;
3236 int ret;
3237
3238 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3239
3240 ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
3241 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3242 &lock_type, &lock_tag, lockers, num_lockers);
3243 if (ret)
3244 return ret;
3245
3246 if (*num_lockers == 0) {
3247 dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
3248 goto out;
3249 }
3250
3251 if (strcmp(lock_tag, RBD_LOCK_TAG)) {
3252 rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
3253 lock_tag);
3254 ret = -EBUSY;
3255 goto out;
3256 }
3257
3258 if (lock_type == CEPH_CLS_LOCK_SHARED) {
3259 rbd_warn(rbd_dev, "shared lock type detected");
3260 ret = -EBUSY;
3261 goto out;
3262 }
3263
3264 if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
3265 strlen(RBD_LOCK_COOKIE_PREFIX))) {
3266 rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
3267 (*lockers)[0].id.cookie);
3268 ret = -EBUSY;
3269 goto out;
3270 }
3271
3272out:
3273 kfree(lock_tag);
3274 return ret;
3275}
3276
3277static int find_watcher(struct rbd_device *rbd_dev,
3278 const struct ceph_locker *locker)
3279{
3280 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3281 struct ceph_watch_item *watchers;
3282 u32 num_watchers;
3283 u64 cookie;
3284 int i;
3285 int ret;
3286
3287 ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
3288 &rbd_dev->header_oloc, &watchers,
3289 &num_watchers);
3290 if (ret)
3291 return ret;
3292
3293 sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
3294 for (i = 0; i < num_watchers; i++) {
3295 if (!memcmp(&watchers[i].addr, &locker->info.addr,
3296 sizeof(locker->info.addr)) &&
3297 watchers[i].cookie == cookie) {
3298 struct rbd_client_id cid = {
3299 .gid = le64_to_cpu(watchers[i].name.num),
3300 .handle = cookie,
3301 };
3302
3303 dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
3304 rbd_dev, cid.gid, cid.handle);
3305 rbd_set_owner_cid(rbd_dev, &cid);
3306 ret = 1;
3307 goto out;
3308 }
3309 }
3310
3311 dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
3312 ret = 0;
3313out:
3314 kfree(watchers);
3315 return ret;
3316}
3317
3318/*
3319 * lock_rwsem must be held for write
3320 */
3321static int rbd_try_lock(struct rbd_device *rbd_dev)
3322{
3323 struct ceph_client *client = rbd_dev->rbd_client->client;
3324 struct ceph_locker *lockers;
3325 u32 num_lockers;
3326 int ret;
3327
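	/*
	 * Try to take the lock; on -EBUSY, look up the current holder
	 * and, if it no longer has a watch established, assume it is
	 * dead: blacklist it and break its lock before retrying.
	 */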
3328 for (;;) {
3329 ret = rbd_lock(rbd_dev);
3330 if (ret != -EBUSY)
3331 return ret;
3332
3333 /* determine if the current lock holder is still alive */
3334 ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
3335 if (ret)
3336 return ret;
3337
3338 if (num_lockers == 0)
3339 goto again;
3340
3341 ret = find_watcher(rbd_dev, lockers);
3342 if (ret) {
3343 if (ret > 0)
3344 ret = 0; /* have to request lock */
3345 goto out;
3346 }
3347
3348 rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
3349 ENTITY_NAME(lockers[0].id.name));
3350
3351 ret = ceph_monc_blacklist_add(&client->monc,
3352 &lockers[0].info.addr);
3353 if (ret) {
3354 rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
3355 ENTITY_NAME(lockers[0].id.name), ret);
3356 goto out;
3357 }
3358
3359 ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
3360 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3361 lockers[0].id.cookie,
3362 &lockers[0].id.name);
3363 if (ret && ret != -ENOENT)
3364 goto out;
3365
3366again:
3367 ceph_free_lockers(lockers, num_lockers);
3368 }
3369
3370out:
3371 ceph_free_lockers(lockers, num_lockers);
3372 return ret;
3373}
3374
3375/*
3376 * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED
3377 */
3378static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev,
3379 int *pret)
3380{
3381 enum rbd_lock_state lock_state;
3382
3383 down_read(&rbd_dev->lock_rwsem);
3384 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3385 rbd_dev->lock_state);
3386 if (__rbd_is_lock_owner(rbd_dev)) {
3387 lock_state = rbd_dev->lock_state;
3388 up_read(&rbd_dev->lock_rwsem);
3389 return lock_state;
3390 }
3391
3392 up_read(&rbd_dev->lock_rwsem);
3393 down_write(&rbd_dev->lock_rwsem);
3394 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3395 rbd_dev->lock_state);
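	/* state may have changed while the semaphore was dropped; re-check */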
3396 if (!__rbd_is_lock_owner(rbd_dev)) {
3397 *pret = rbd_try_lock(rbd_dev);
3398 if (*pret)
3399 rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret);
3400 }
3401
3402 lock_state = rbd_dev->lock_state;
3403 up_write(&rbd_dev->lock_rwsem);
3404 return lock_state;
3405}
3406
3407static void rbd_acquire_lock(struct work_struct *work)
3408{
3409 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3410 struct rbd_device, lock_dwork);
3411 enum rbd_lock_state lock_state;
Kefeng Wang37f13252017-07-13 15:46:35 +08003412 int ret = 0;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003413
3414 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3415again:
3416 lock_state = rbd_try_acquire_lock(rbd_dev, &ret);
3417 if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) {
3418 if (lock_state == RBD_LOCK_STATE_LOCKED)
3419 wake_requests(rbd_dev, true);
3420 dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
3421 rbd_dev, lock_state, ret);
3422 return;
3423 }
3424
3425 ret = rbd_request_lock(rbd_dev);
3426 if (ret == -ETIMEDOUT) {
3427 goto again; /* treat this as a dead client */
Ilya Dryomove010dd02017-04-13 12:17:39 +02003428 } else if (ret == -EROFS) {
3429 rbd_warn(rbd_dev, "peer will not release lock");
3430 /*
3431 * If this is rbd_add_acquire_lock(), we want to fail
3432 * immediately -- reuse BLACKLISTED flag. Otherwise we
3433 * want to block.
3434 */
3435 if (!(rbd_dev->disk->flags & GENHD_FL_UP)) {
3436 set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
3437 /* wake "rbd map --exclusive" process */
3438 wake_requests(rbd_dev, false);
3439 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02003440 } else if (ret < 0) {
3441 rbd_warn(rbd_dev, "error requesting lock: %d", ret);
3442 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3443 RBD_RETRY_DELAY);
3444 } else {
3445 /*
3446 * lock owner acked, but resend if we don't see them
3447 * release the lock
3448 */
3449 dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
3450 rbd_dev);
3451 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3452 msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
3453 }
3454}
3455
3456/*
3457 * lock_rwsem must be held for write
3458 */
3459static bool rbd_release_lock(struct rbd_device *rbd_dev)
3460{
3461 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3462 rbd_dev->lock_state);
3463 if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
3464 return false;
3465
3466 rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
3467 downgrade_write(&rbd_dev->lock_rwsem);
3468 /*
3469 * Ensure that all in-flight IO is flushed.
3470 *
3471 * FIXME: ceph_osdc_sync() flushes the entire OSD client, which
3472 * may be shared with other devices.
3473 */
3474 ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc);
3475 up_read(&rbd_dev->lock_rwsem);
3476
3477 down_write(&rbd_dev->lock_rwsem);
3478 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3479 rbd_dev->lock_state);
3480 if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
3481 return false;
3482
Ilya Dryomovbbead742017-04-13 12:17:38 +02003483 rbd_unlock(rbd_dev);
3484 /*
3485 * Give others a chance to grab the lock - we would re-acquire
3486 * almost immediately if we got new IO during ceph_osdc_sync()
3487 * otherwise. We need to ack our own notifications, so this
3488 * lock_dwork will be requeued from rbd_wait_state_locked()
3489 * after wake_requests() in rbd_handle_released_lock().
3490 */
3491 cancel_delayed_work(&rbd_dev->lock_dwork);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003492 return true;
3493}
3494
3495static void rbd_release_lock_work(struct work_struct *work)
3496{
3497 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3498 unlock_work);
3499
3500 down_write(&rbd_dev->lock_rwsem);
3501 rbd_release_lock(rbd_dev);
3502 up_write(&rbd_dev->lock_rwsem);
3503}
3504
3505static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
3506 void **p)
3507{
3508 struct rbd_client_id cid = { 0 };
3509
3510 if (struct_v >= 2) {
3511 cid.gid = ceph_decode_64(p);
3512 cid.handle = ceph_decode_64(p);
3513 }
3514
3515 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3516 cid.handle);
3517 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3518 down_write(&rbd_dev->lock_rwsem);
3519 if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3520 /*
3521 * we already know that the remote client is
3522 * the owner
3523 */
3524 up_write(&rbd_dev->lock_rwsem);
3525 return;
3526 }
3527
3528 rbd_set_owner_cid(rbd_dev, &cid);
3529 downgrade_write(&rbd_dev->lock_rwsem);
3530 } else {
3531 down_read(&rbd_dev->lock_rwsem);
3532 }
3533
3534 if (!__rbd_is_lock_owner(rbd_dev))
3535 wake_requests(rbd_dev, false);
3536 up_read(&rbd_dev->lock_rwsem);
3537}
3538
3539static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
3540 void **p)
3541{
3542 struct rbd_client_id cid = { 0 };
3543
3544 if (struct_v >= 2) {
3545 cid.gid = ceph_decode_64(p);
3546 cid.handle = ceph_decode_64(p);
3547 }
3548
3549 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3550 cid.handle);
3551 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3552 down_write(&rbd_dev->lock_rwsem);
3553 if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3554 dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
3555 __func__, rbd_dev, cid.gid, cid.handle,
3556 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
3557 up_write(&rbd_dev->lock_rwsem);
3558 return;
3559 }
3560
3561 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3562 downgrade_write(&rbd_dev->lock_rwsem);
3563 } else {
3564 down_read(&rbd_dev->lock_rwsem);
3565 }
3566
3567 if (!__rbd_is_lock_owner(rbd_dev))
3568 wake_requests(rbd_dev, false);
3569 up_read(&rbd_dev->lock_rwsem);
3570}
3571
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003572/*
3573 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
3574 * ResponseMessage is needed.
3575 */
3576static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
3577 void **p)
Ilya Dryomoved95b212016-08-12 16:40:02 +02003578{
3579 struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
3580 struct rbd_client_id cid = { 0 };
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003581 int result = 1;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003582
3583 if (struct_v >= 2) {
3584 cid.gid = ceph_decode_64(p);
3585 cid.handle = ceph_decode_64(p);
3586 }
3587
3588 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3589 cid.handle);
3590 if (rbd_cid_equal(&cid, &my_cid))
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003591 return result;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003592
3593 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003594 if (__rbd_is_lock_owner(rbd_dev)) {
3595 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
3596 rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
3597 goto out_unlock;
3598
3599 /*
3600 * encode ResponseMessage(0) so the peer can detect
3601 * a missing owner
3602 */
3603 result = 0;
3604
3605 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
Ilya Dryomove010dd02017-04-13 12:17:39 +02003606 if (!rbd_dev->opts->exclusive) {
3607 dout("%s rbd_dev %p queueing unlock_work\n",
3608 __func__, rbd_dev);
3609 queue_work(rbd_dev->task_wq,
3610 &rbd_dev->unlock_work);
3611 } else {
3612 /* refuse to release the lock */
3613 result = -EROFS;
3614 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02003615 }
3616 }
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003617
3618out_unlock:
Ilya Dryomoved95b212016-08-12 16:40:02 +02003619 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003620 return result;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003621}
3622
3623static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
3624 u64 notify_id, u64 cookie, s32 *result)
3625{
3626 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3627 int buf_size = 4 + CEPH_ENCODING_START_BLK_LEN;
3628 char buf[buf_size];
3629 int ret;
3630
3631 if (result) {
3632 void *p = buf;
3633
3634 /* encode ResponseMessage */
3635 ceph_start_encoding(&p, 1, 1,
3636 buf_size - CEPH_ENCODING_START_BLK_LEN);
3637 ceph_encode_32(&p, *result);
3638 } else {
3639 buf_size = 0;
3640 }
3641
3642 ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
3643 &rbd_dev->header_oloc, notify_id, cookie,
3644 buf, buf_size);
3645 if (ret)
3646 rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
3647}
3648
3649static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
3650 u64 cookie)
3651{
3652 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3653 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
3654}
3655
3656static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
3657 u64 notify_id, u64 cookie, s32 result)
3658{
3659 dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3660 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
3661}
Ilya Dryomov922dab62016-05-26 01:15:02 +02003662
3663static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
3664 u64 notifier_id, void *data, size_t data_len)
Alex Elderb8d70032012-11-30 17:53:04 -06003665{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003666 struct rbd_device *rbd_dev = arg;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003667 void *p = data;
3668 void *const end = p + data_len;
Ilya Dryomovd4c22692016-09-06 11:15:48 +02003669 u8 struct_v = 0;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003670 u32 len;
3671 u32 notify_op;
Alex Elderb8d70032012-11-30 17:53:04 -06003672 int ret;
3673
Ilya Dryomoved95b212016-08-12 16:40:02 +02003674 dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
3675 __func__, rbd_dev, cookie, notify_id, data_len);
3676 if (data_len) {
3677 ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
3678 &struct_v, &len);
3679 if (ret) {
3680 rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
3681 ret);
3682 return;
3683 }
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04003684
Ilya Dryomoved95b212016-08-12 16:40:02 +02003685 notify_op = ceph_decode_32(&p);
3686 } else {
3687 /* legacy notification for header updates */
3688 notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
3689 len = 0;
3690 }
Alex Elderb8d70032012-11-30 17:53:04 -06003691
Ilya Dryomoved95b212016-08-12 16:40:02 +02003692 dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
3693 switch (notify_op) {
3694 case RBD_NOTIFY_OP_ACQUIRED_LOCK:
3695 rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
3696 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3697 break;
3698 case RBD_NOTIFY_OP_RELEASED_LOCK:
3699 rbd_handle_released_lock(rbd_dev, struct_v, &p);
3700 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3701 break;
3702 case RBD_NOTIFY_OP_REQUEST_LOCK:
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003703 ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
3704 if (ret <= 0)
Ilya Dryomoved95b212016-08-12 16:40:02 +02003705 rbd_acknowledge_notify_result(rbd_dev, notify_id,
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003706 cookie, ret);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003707 else
3708 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3709 break;
3710 case RBD_NOTIFY_OP_HEADER_UPDATE:
3711 ret = rbd_dev_refresh(rbd_dev);
3712 if (ret)
3713 rbd_warn(rbd_dev, "refresh failed: %d", ret);
3714
3715 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3716 break;
3717 default:
3718 if (rbd_is_lock_owner(rbd_dev))
3719 rbd_acknowledge_notify_result(rbd_dev, notify_id,
3720 cookie, -EOPNOTSUPP);
3721 else
3722 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3723 break;
3724 }
Alex Elderb8d70032012-11-30 17:53:04 -06003725}
3726
Ilya Dryomov99d16942016-08-12 16:11:41 +02003727static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
3728
Ilya Dryomov922dab62016-05-26 01:15:02 +02003729static void rbd_watch_errcb(void *arg, u64 cookie, int err)
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003730{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003731 struct rbd_device *rbd_dev = arg;
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003732
Ilya Dryomov922dab62016-05-26 01:15:02 +02003733 rbd_warn(rbd_dev, "encountered watch error: %d", err);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003734
Ilya Dryomoved95b212016-08-12 16:40:02 +02003735 down_write(&rbd_dev->lock_rwsem);
3736 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3737 up_write(&rbd_dev->lock_rwsem);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003738
Ilya Dryomov99d16942016-08-12 16:11:41 +02003739 mutex_lock(&rbd_dev->watch_mutex);
3740 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
3741 __rbd_unregister_watch(rbd_dev);
3742 rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003743
Ilya Dryomov99d16942016-08-12 16:11:41 +02003744 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003745 }
Ilya Dryomov99d16942016-08-12 16:11:41 +02003746 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003747}
3748
3749/*
Ilya Dryomov99d16942016-08-12 16:11:41 +02003750 * watch_mutex must be locked
Alex Elder9969ebc2013-01-18 12:31:10 -06003751 */
Ilya Dryomov99d16942016-08-12 16:11:41 +02003752static int __rbd_register_watch(struct rbd_device *rbd_dev)
Alex Elder9969ebc2013-01-18 12:31:10 -06003753{
3754 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomov922dab62016-05-26 01:15:02 +02003755 struct ceph_osd_linger_request *handle;
Alex Elder9969ebc2013-01-18 12:31:10 -06003756
Ilya Dryomov922dab62016-05-26 01:15:02 +02003757 rbd_assert(!rbd_dev->watch_handle);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003758 dout("%s rbd_dev %p\n", __func__, rbd_dev);
Alex Elder9969ebc2013-01-18 12:31:10 -06003759
Ilya Dryomov922dab62016-05-26 01:15:02 +02003760 handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
3761 &rbd_dev->header_oloc, rbd_watch_cb,
3762 rbd_watch_errcb, rbd_dev);
3763 if (IS_ERR(handle))
3764 return PTR_ERR(handle);
Alex Elder9969ebc2013-01-18 12:31:10 -06003765
Ilya Dryomov922dab62016-05-26 01:15:02 +02003766 rbd_dev->watch_handle = handle;
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003767 return 0;
Alex Elder9969ebc2013-01-18 12:31:10 -06003768}
3769
Ilya Dryomov99d16942016-08-12 16:11:41 +02003770/*
3771 * watch_mutex must be locked
3772 */
3773static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
Ilya Dryomovfca27062013-12-16 18:02:40 +02003774{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003775 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3776 int ret;
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003777
Ilya Dryomov99d16942016-08-12 16:11:41 +02003778 rbd_assert(rbd_dev->watch_handle);
3779 dout("%s rbd_dev %p\n", __func__, rbd_dev);
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003780
Ilya Dryomov922dab62016-05-26 01:15:02 +02003781 ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
3782 if (ret)
3783 rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003784
Ilya Dryomov922dab62016-05-26 01:15:02 +02003785 rbd_dev->watch_handle = NULL;
Ilya Dryomovc525f032016-04-28 16:07:26 +02003786}
3787
Ilya Dryomov99d16942016-08-12 16:11:41 +02003788static int rbd_register_watch(struct rbd_device *rbd_dev)
Ilya Dryomovc525f032016-04-28 16:07:26 +02003789{
Ilya Dryomov99d16942016-08-12 16:11:41 +02003790 int ret;
Ilya Dryomov811c6682016-04-15 16:22:16 +02003791
Ilya Dryomov99d16942016-08-12 16:11:41 +02003792 mutex_lock(&rbd_dev->watch_mutex);
3793 rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
3794 ret = __rbd_register_watch(rbd_dev);
3795 if (ret)
3796 goto out;
3797
3798 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
3799 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
3800
3801out:
3802 mutex_unlock(&rbd_dev->watch_mutex);
3803 return ret;
3804}
3805
3806static void cancel_tasks_sync(struct rbd_device *rbd_dev)
3807{
3808 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3809
3810 cancel_delayed_work_sync(&rbd_dev->watch_dwork);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003811 cancel_work_sync(&rbd_dev->acquired_lock_work);
3812 cancel_work_sync(&rbd_dev->released_lock_work);
3813 cancel_delayed_work_sync(&rbd_dev->lock_dwork);
3814 cancel_work_sync(&rbd_dev->unlock_work);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003815}
3816
3817static void rbd_unregister_watch(struct rbd_device *rbd_dev)
3818{
Ilya Dryomoved95b212016-08-12 16:40:02 +02003819 WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
Ilya Dryomov99d16942016-08-12 16:11:41 +02003820 cancel_tasks_sync(rbd_dev);
3821
3822 mutex_lock(&rbd_dev->watch_mutex);
3823 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
3824 __rbd_unregister_watch(rbd_dev);
3825 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
3826 mutex_unlock(&rbd_dev->watch_mutex);
3827
Ilya Dryomov811c6682016-04-15 16:22:16 +02003828 ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
Ilya Dryomovfca27062013-12-16 18:02:40 +02003829}
3830
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003831/*
3832 * lock_rwsem must be held for write
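 *
 * Re-establishing the watch produces a new cookie, so the lock cookie
 * recorded on the OSDs must be updated to keep the lock tied to the
 * live watch session.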
3833 */
3834static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
3835{
3836 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3837 char cookie[32];
3838 int ret;
3839
3840 WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);
3841
3842 format_lock_cookie(rbd_dev, cookie);
3843 ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
3844 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3845 CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
3846 RBD_LOCK_TAG, cookie);
3847 if (ret) {
3848 if (ret != -EOPNOTSUPP)
3849 rbd_warn(rbd_dev, "failed to update lock cookie: %d",
3850 ret);
3851
3852 /*
3853 * Lock cookie cannot be updated on older OSDs, so do
3854 * a manual release and queue an acquire.
3855 */
3856 if (rbd_release_lock(rbd_dev))
3857 queue_delayed_work(rbd_dev->task_wq,
3858 &rbd_dev->lock_dwork, 0);
3859 } else {
3860 strcpy(rbd_dev->lock_cookie, cookie);
3861 }
3862}
3863
Ilya Dryomov99d16942016-08-12 16:11:41 +02003864static void rbd_reregister_watch(struct work_struct *work)
3865{
3866 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3867 struct rbd_device, watch_dwork);
3868 int ret;
3869
3870 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3871
3872 mutex_lock(&rbd_dev->watch_mutex);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003873 if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
3874 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003875 return;
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003876 }
Ilya Dryomov99d16942016-08-12 16:11:41 +02003877
3878 ret = __rbd_register_watch(rbd_dev);
3879 if (ret) {
3880 rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
Ilya Dryomov4d736442016-09-29 14:23:12 +02003881 if (ret == -EBLACKLISTED || ret == -ENOENT) {
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003882 set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003883 wake_requests(rbd_dev, true);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003884 } else {
Ilya Dryomov99d16942016-08-12 16:11:41 +02003885 queue_delayed_work(rbd_dev->task_wq,
3886 &rbd_dev->watch_dwork,
3887 RBD_RETRY_DELAY);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003888 }
3889 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003890 return;
Ilya Dryomov99d16942016-08-12 16:11:41 +02003891 }
3892
3893 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
3894 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
3895 mutex_unlock(&rbd_dev->watch_mutex);
3896
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003897 down_write(&rbd_dev->lock_rwsem);
3898 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
3899 rbd_reacquire_lock(rbd_dev);
3900 up_write(&rbd_dev->lock_rwsem);
3901
Ilya Dryomov99d16942016-08-12 16:11:41 +02003902 ret = rbd_dev_refresh(rbd_dev);
3903 if (ret)
3904		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003905}
3906
Alex Elder36be9a72013-01-19 00:30:28 -06003907/*
Alex Elderf40eb342013-04-25 15:09:42 -05003908 * Synchronous osd object method call. Returns the number of bytes
3909 * returned in the inbound buffer, or a negative error code.
Alex Elder36be9a72013-01-19 00:30:28 -06003910 */
3911static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003912 struct ceph_object_id *oid,
3913 struct ceph_object_locator *oloc,
Alex Elder36be9a72013-01-19 00:30:28 -06003914 const char *method_name,
Alex Elder41579762013-04-21 12:14:45 -05003915 const void *outbound,
Alex Elder36be9a72013-01-19 00:30:28 -06003916 size_t outbound_size,
Alex Elder41579762013-04-21 12:14:45 -05003917 void *inbound,
Alex Eldere2a58ee2013-04-30 00:44:33 -05003918 size_t inbound_size)
Alex Elder36be9a72013-01-19 00:30:28 -06003919{
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003920 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3921 struct page *req_page = NULL;
3922 struct page *reply_page;
Alex Elder36be9a72013-01-19 00:30:28 -06003923 int ret;
3924
3925 /*
Alex Elder6010a452013-04-05 01:27:11 -05003926 * Method calls are ultimately read operations. The result
3927	 * should be placed into the inbound buffer provided.  Callers
3928	 * may also supply outbound data--parameters for the object
3929	 * method.  Currently if this is present it will be a
3930	 * snapshot id.
Alex Elder36be9a72013-01-19 00:30:28 -06003931 */
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003932 if (outbound) {
3933 if (outbound_size > PAGE_SIZE)
3934 return -E2BIG;
Alex Elder36be9a72013-01-19 00:30:28 -06003935
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003936 req_page = alloc_page(GFP_KERNEL);
3937 if (!req_page)
3938 return -ENOMEM;
Alex Elder36be9a72013-01-19 00:30:28 -06003939
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003940 memcpy(page_address(req_page), outbound, outbound_size);
Alex Elder04017e22013-04-05 14:46:02 -05003941 }
Alex Elder430c28c2013-04-03 21:32:51 -05003942
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003943 reply_page = alloc_page(GFP_KERNEL);
3944 if (!reply_page) {
3945 if (req_page)
3946 __free_page(req_page);
3947 return -ENOMEM;
3948 }
Alex Elder36be9a72013-01-19 00:30:28 -06003949
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003950 ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
3951 CEPH_OSD_FLAG_READ, req_page, outbound_size,
3952 reply_page, &inbound_size);
3953 if (!ret) {
3954 memcpy(inbound, page_address(reply_page), inbound_size);
3955 ret = inbound_size;
3956 }
Alex Elder57385b52013-04-21 12:14:45 -05003957
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003958 if (req_page)
3959 __free_page(req_page);
3960 __free_page(reply_page);
Alex Elder36be9a72013-01-19 00:30:28 -06003961 return ret;
3962}
3963
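/*
 * A typical rbd_obj_method_sync() call (sketch mirroring
 * _rbd_dev_v2_snap_size() below) passes the header object's oid/oloc,
 * the name of a method registered by the "rbd" OSD class, an optional
 * encoded request and a reply buffer:
 *
 *	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
 *				  &rbd_dev->header_oloc, "get_size",
 *				  &snapid, sizeof(snapid),
 *				  &size_buf, sizeof(size_buf));
 */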
Ilya Dryomoved95b212016-08-12 16:40:02 +02003964/*
3965 * lock_rwsem must be held for read
3966 */
3967static void rbd_wait_state_locked(struct rbd_device *rbd_dev)
3968{
3969 DEFINE_WAIT(wait);
3970
3971 do {
3972 /*
3973 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3974 * and cancel_delayed_work() in wake_requests().
3975 */
3976 dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3977 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3978 prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
3979 TASK_UNINTERRUPTIBLE);
3980 up_read(&rbd_dev->lock_rwsem);
3981 schedule();
3982 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003983 } while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED &&
3984 !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags));
3985
Ilya Dryomoved95b212016-08-12 16:40:02 +02003986 finish_wait(&rbd_dev->lock_waitq, &wait);
3987}
3988
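/*
 * Runs in workqueue (rbd_wq) context: translate a block layer request
 * into an rbd_img_request and submit it, taking lock_rwsem for read
 * first if the image requires the exclusive lock for this type of I/O.
 */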
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003989static void rbd_queue_workfn(struct work_struct *work)
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003990{
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003991 struct request *rq = blk_mq_rq_from_pdu(work);
3992 struct rbd_device *rbd_dev = rq->q->queuedata;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003993 struct rbd_img_request *img_request;
Josh Durgin4e752f02014-04-08 11:12:11 -07003994 struct ceph_snap_context *snapc = NULL;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003995 u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
3996 u64 length = blk_rq_bytes(rq);
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08003997 enum obj_operation_type op_type;
Josh Durgin4e752f02014-04-08 11:12:11 -07003998 u64 mapping_size;
Ilya Dryomov80de1912016-09-20 14:23:17 +02003999 bool must_be_locked;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004000 int result;
4001
Christoph Hellwigaebf5262017-01-31 16:57:31 +01004002 switch (req_op(rq)) {
4003 case REQ_OP_DISCARD:
Ilya Dryomov6ac56952017-05-22 19:59:24 +02004004 case REQ_OP_WRITE_ZEROES:
Christoph Hellwigaebf5262017-01-31 16:57:31 +01004005 op_type = OBJ_OP_DISCARD;
4006 break;
4007 case REQ_OP_WRITE:
4008 op_type = OBJ_OP_WRITE;
4009 break;
4010 case REQ_OP_READ:
4011 op_type = OBJ_OP_READ;
4012 break;
4013 default:
4014 dout("%s: non-fs request type %d\n", __func__, req_op(rq));
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004015 result = -EIO;
4016 goto err;
4017 }
4018
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004019 /* Ignore/skip any zero-length requests */
4020
4021 if (!length) {
4022 dout("%s: zero-length request\n", __func__);
4023 result = 0;
4024 goto err_rq;
4025 }
4026
Ilya Dryomov9568c932017-10-12 12:35:19 +02004027 rbd_assert(op_type == OBJ_OP_READ ||
4028 rbd_dev->spec->snap_id == CEPH_NOSNAP);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004029
4030 /*
4031 * Quit early if the mapped snapshot no longer exists. It's
4032 * still possible the snapshot will have disappeared by the
4033 * time our request arrives at the osd, but there's no sense in
4034 * sending it if we already know.
4035 */
4036 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
4037		dout("request for non-existent snapshot\n");
4038 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
4039 result = -ENXIO;
4040 goto err_rq;
4041 }
4042
4043 if (offset && length > U64_MAX - offset + 1) {
4044 rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
4045 length);
4046 result = -EINVAL;
4047 goto err_rq; /* Shouldn't happen */
4048 }
4049
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004050 blk_mq_start_request(rq);
4051
Josh Durgin4e752f02014-04-08 11:12:11 -07004052 down_read(&rbd_dev->header_rwsem);
4053 mapping_size = rbd_dev->mapping.size;
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08004054 if (op_type != OBJ_OP_READ) {
Josh Durgin4e752f02014-04-08 11:12:11 -07004055 snapc = rbd_dev->header.snapc;
4056 ceph_get_snap_context(snapc);
4057 }
4058 up_read(&rbd_dev->header_rwsem);
4059
4060 if (offset + length > mapping_size) {
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004061 rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
Josh Durgin4e752f02014-04-08 11:12:11 -07004062 length, mapping_size);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004063 result = -EIO;
4064 goto err_rq;
4065 }
4066
Ilya Dryomovf9bebd52017-04-13 12:17:39 +02004067 must_be_locked =
4068 (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
4069 (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004070 if (must_be_locked) {
4071 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02004072 if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED &&
Ilya Dryomove010dd02017-04-13 12:17:39 +02004073 !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
4074 if (rbd_dev->opts->exclusive) {
4075 rbd_warn(rbd_dev, "exclusive lock required");
4076 result = -EROFS;
4077 goto err_unlock;
4078 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02004079 rbd_wait_state_locked(rbd_dev);
Ilya Dryomove010dd02017-04-13 12:17:39 +02004080 }
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02004081 if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
4082 result = -EBLACKLISTED;
4083 goto err_unlock;
4084 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02004085 }
4086
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08004087 img_request = rbd_img_request_create(rbd_dev, offset, length, op_type,
Josh Durgin4e752f02014-04-08 11:12:11 -07004088 snapc);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004089 if (!img_request) {
4090 result = -ENOMEM;
Ilya Dryomoved95b212016-08-12 16:40:02 +02004091 goto err_unlock;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004092 }
4093 img_request->rq = rq;
Ilya Dryomov70b16db2015-11-27 19:23:24 +01004094 snapc = NULL; /* img_request consumes a ref */
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004095
Guangliang Zhao90e98c52014-04-01 22:22:16 +08004096 if (op_type == OBJ_OP_DISCARD)
4097 result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA,
4098 NULL);
4099 else
4100 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
4101 rq->bio);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004102 if (result)
4103 goto err_img_request;
4104
4105 result = rbd_img_request_submit(img_request);
4106 if (result)
4107 goto err_img_request;
4108
Ilya Dryomoved95b212016-08-12 16:40:02 +02004109 if (must_be_locked)
4110 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004111 return;
4112
4113err_img_request:
4114 rbd_img_request_put(img_request);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004115err_unlock:
4116 if (must_be_locked)
4117 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004118err_rq:
4119 if (result)
4120 rbd_warn(rbd_dev, "%s %llx at %llx result %d",
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08004121 obj_op_name(op_type), length, offset, result);
SF Markus Elfringe96a6502014-11-02 15:20:59 +01004122 ceph_put_snap_context(snapc);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004123err:
Christoph Hellwig2a842ac2017-06-03 09:38:04 +02004124 blk_mq_end_request(rq, errno_to_blk_status(result));
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004125}
4126
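/*
 * blk-mq ->queue_rq() handler: hand the request off to rbd_wq so that
 * rbd_queue_workfn() can sleep (OSD round trips, lock waits) outside
 * the block layer's dispatch context.
 */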
Christoph Hellwigfc17b652017-06-03 09:38:05 +02004127static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004128 const struct blk_mq_queue_data *bd)
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004129{
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004130 struct request *rq = bd->rq;
4131 struct work_struct *work = blk_mq_rq_to_pdu(rq);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004132
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004133 queue_work(rbd_wq, work);
Christoph Hellwigfc17b652017-06-03 09:38:05 +02004134 return BLK_STS_OK;
Alex Elderbf0d5f502012-11-22 00:00:08 -06004135}
4136
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004137static void rbd_free_disk(struct rbd_device *rbd_dev)
4138{
Ilya Dryomov5769ed02017-04-13 12:17:38 +02004139 blk_cleanup_queue(rbd_dev->disk->queue);
4140 blk_mq_free_tag_set(&rbd_dev->tag_set);
4141 put_disk(rbd_dev->disk);
Alex Eldera0cab922013-04-25 23:15:08 -05004142 rbd_dev->disk = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004143}
4144
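/*
 * Synchronously read up to buf_len bytes from the start of the given
 * object into buf.  Used below to fetch the format 1 image header.
 * Returns the number of bytes read or a negative error code.
 */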
Alex Elder788e2df2013-01-17 12:25:27 -06004145static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004146 struct ceph_object_id *oid,
4147 struct ceph_object_locator *oloc,
4148 void *buf, int buf_len)
Alex Elder788e2df2013-01-17 12:25:27 -06004149{
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004151 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4152 struct ceph_osd_request *req;
4153 struct page **pages;
4154 int num_pages = calc_pages_for(0, buf_len);
Alex Elder788e2df2013-01-17 12:25:27 -06004155 int ret;
4156
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004157 req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
4158 if (!req)
4159 return -ENOMEM;
Alex Elder788e2df2013-01-17 12:25:27 -06004160
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004161 ceph_oid_copy(&req->r_base_oid, oid);
4162 ceph_oloc_copy(&req->r_base_oloc, oloc);
4163 req->r_flags = CEPH_OSD_FLAG_READ;
Alex Elder788e2df2013-01-17 12:25:27 -06004164
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004165 ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
Alex Elder788e2df2013-01-17 12:25:27 -06004166 if (ret)
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004167 goto out_req;
Alex Elder788e2df2013-01-17 12:25:27 -06004168
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004169 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
4170 if (IS_ERR(pages)) {
4171 ret = PTR_ERR(pages);
4172 goto out_req;
4173 }
Alex Elder1ceae7e2013-02-06 13:11:38 -06004174
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004175 osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
4176 osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
4177 true);
Alex Elder788e2df2013-01-17 12:25:27 -06004178
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004179 ceph_osdc_start_request(osdc, req, false);
4180 ret = ceph_osdc_wait_request(osdc, req);
4181 if (ret >= 0)
4182 ceph_copy_from_page_vector(pages, buf, 0, ret);
4183
4184out_req:
4185 ceph_osdc_put_request(req);
Alex Elder788e2df2013-01-17 12:25:27 -06004186 return ret;
4187}
4188
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004189/*
Alex Elder662518b2013-05-06 09:51:29 -05004190 * Read the complete header for the given rbd device. On successful
4191 * return, the rbd_dev->header field will contain up-to-date
4192 * information about the image.
Alex Elder4156d992012-08-02 11:29:46 -05004193 */
Alex Elder99a41eb2013-05-06 09:51:30 -05004194static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
Alex Elder4156d992012-08-02 11:29:46 -05004195{
4196 struct rbd_image_header_ondisk *ondisk = NULL;
4197 u32 snap_count = 0;
4198 u64 names_size = 0;
4199 u32 want_count;
4200 int ret;
4201
4202 /*
4203 * The complete header will include an array of its 64-bit
4204 * snapshot ids, followed by the names of those snapshots as
4205 * a contiguous block of NUL-terminated strings. Note that
4206 * the number of snapshots could change by the time we read
4207 * it in, in which case we re-read it.
4208 */
4209 do {
4210 size_t size;
4211
4212 kfree(ondisk);
4213
4214 size = sizeof (*ondisk);
4215 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
4216 size += names_size;
4217 ondisk = kmalloc(size, GFP_KERNEL);
4218 if (!ondisk)
Alex Elder662518b2013-05-06 09:51:29 -05004219 return -ENOMEM;
Alex Elder4156d992012-08-02 11:29:46 -05004220
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004221 ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
4222 &rbd_dev->header_oloc, ondisk, size);
Alex Elder4156d992012-08-02 11:29:46 -05004223 if (ret < 0)
Alex Elder662518b2013-05-06 09:51:29 -05004224 goto out;
Alex Elderc0cd10db2013-04-26 09:43:47 -05004225 if ((size_t)ret < size) {
Alex Elder4156d992012-08-02 11:29:46 -05004226 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05004227 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
4228 size, ret);
Alex Elder662518b2013-05-06 09:51:29 -05004229 goto out;
Alex Elder4156d992012-08-02 11:29:46 -05004230 }
4231 if (!rbd_dev_ondisk_valid(ondisk)) {
4232 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05004233 rbd_warn(rbd_dev, "invalid header");
Alex Elder662518b2013-05-06 09:51:29 -05004234 goto out;
Alex Elder4156d992012-08-02 11:29:46 -05004235 }
4236
4237 names_size = le64_to_cpu(ondisk->snap_names_len);
4238 want_count = snap_count;
4239 snap_count = le32_to_cpu(ondisk->snap_count);
4240 } while (snap_count != want_count);
4241
Alex Elder662518b2013-05-06 09:51:29 -05004242 ret = rbd_header_from_disk(rbd_dev, ondisk);
4243out:
Alex Elder4156d992012-08-02 11:29:46 -05004244 kfree(ondisk);
4245
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004246 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004247}
4248
Alex Elder15228ed2013-05-01 12:43:03 -05004249/*
4250 * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
4251 * has disappeared from the (just updated) snapshot context.
4252 */
4253static void rbd_exists_validate(struct rbd_device *rbd_dev)
4254{
4255 u64 snap_id;
4256
4257 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
4258 return;
4259
4260 snap_id = rbd_dev->spec->snap_id;
4261 if (snap_id == CEPH_NOSNAP)
4262 return;
4263
4264 if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
4265 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
4266}
4267
Josh Durgin98752012013-08-29 17:26:31 -07004268static void rbd_dev_update_size(struct rbd_device *rbd_dev)
4269{
4270 sector_t size;
Josh Durgin98752012013-08-29 17:26:31 -07004271
4272 /*
Ilya Dryomov811c6682016-04-15 16:22:16 +02004273 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
4274 * try to update its size. If REMOVING is set, updating size
4275 * is just useless work since the device can't be opened.
Josh Durgin98752012013-08-29 17:26:31 -07004276 */
Ilya Dryomov811c6682016-04-15 16:22:16 +02004277 if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
4278 !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
Josh Durgin98752012013-08-29 17:26:31 -07004279 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
4280 dout("setting size to %llu sectors", (unsigned long long)size);
4281 set_capacity(rbd_dev->disk, size);
4282 revalidate_disk(rbd_dev->disk);
4283 }
4284}
4285
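/*
 * Re-read the image header and bring the in-core state up to date:
 * the mapping size, the EXISTS flag of a mapped snapshot and, for
 * clones, the parent info, which may have vanished if the image was
 * flattened.  Called on header object notifications, on watch
 * re-registration and from the sysfs "refresh" attribute.
 */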
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004286static int rbd_dev_refresh(struct rbd_device *rbd_dev)
Alex Elder1fe5e992012-07-25 09:32:41 -05004287{
Alex Eldere627db02013-05-06 07:40:30 -05004288 u64 mapping_size;
Alex Elder1fe5e992012-07-25 09:32:41 -05004289 int ret;
4290
Alex Eldercfbf6372013-05-31 17:40:45 -05004291 down_write(&rbd_dev->header_rwsem);
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004292 mapping_size = rbd_dev->mapping.size;
Ilya Dryomova720ae02014-07-23 17:11:19 +04004293
4294 ret = rbd_dev_header_info(rbd_dev);
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04004295 if (ret)
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004296 goto out;
Alex Elder15228ed2013-05-01 12:43:03 -05004297
Ilya Dryomove8f59b52014-07-24 10:42:13 +04004298 /*
4299 * If there is a parent, see if it has disappeared due to the
4300 * mapped image getting flattened.
4301 */
4302 if (rbd_dev->parent) {
4303 ret = rbd_dev_v2_parent_info(rbd_dev);
4304 if (ret)
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004305 goto out;
Ilya Dryomove8f59b52014-07-24 10:42:13 +04004306 }
4307
Ilya Dryomov5ff11082014-07-23 17:11:21 +04004308 if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004309 rbd_dev->mapping.size = rbd_dev->header.image_size;
Ilya Dryomov5ff11082014-07-23 17:11:21 +04004310 } else {
4311 /* validate mapped snapshot's EXISTS flag */
4312 rbd_exists_validate(rbd_dev);
4313 }
Alex Elder15228ed2013-05-01 12:43:03 -05004314
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004315out:
Alex Eldercfbf6372013-05-31 17:40:45 -05004316 up_write(&rbd_dev->header_rwsem);
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004317 if (!ret && mapping_size != rbd_dev->mapping.size)
Josh Durgin98752012013-08-29 17:26:31 -07004318 rbd_dev_update_size(rbd_dev);
Alex Elder1fe5e992012-07-25 09:32:41 -05004319
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004320 return ret;
Alex Elder1fe5e992012-07-25 09:32:41 -05004321}
4322
Christoph Hellwigd6296d32017-05-01 10:19:08 -06004323static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
4324 unsigned int hctx_idx, unsigned int numa_node)
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004325{
4326 struct work_struct *work = blk_mq_rq_to_pdu(rq);
4327
4328 INIT_WORK(work, rbd_queue_workfn);
4329 return 0;
4330}
4331
Eric Biggersf363b082017-03-30 13:39:16 -07004332static const struct blk_mq_ops rbd_mq_ops = {
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004333 .queue_rq = rbd_queue_rq,
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004334 .init_request = rbd_init_request,
4335};
4336
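/*
 * Set up the gendisk and a single-hw-queue blk-mq queue for the
 * mapping.  The I/O limits (max sectors, segment size, discard
 * granularity) are all derived from the image's object size.
 */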
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004337static int rbd_init_disk(struct rbd_device *rbd_dev)
4338{
4339 struct gendisk *disk;
4340 struct request_queue *q;
Alex Elder593a9e72012-02-07 12:03:37 -06004341 u64 segment_size;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004342 int err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004343
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004344 /* create gendisk info */
Ilya Dryomov7e513d42013-12-16 19:26:32 +02004345 disk = alloc_disk(single_major ?
4346 (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
4347 RBD_MINORS_PER_MAJOR);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004348 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05004349 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004350
Alex Elderf0f8cef2012-01-29 13:57:44 -06004351 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05004352 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004353 disk->major = rbd_dev->major;
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004354 disk->first_minor = rbd_dev->minor;
Ilya Dryomov7e513d42013-12-16 19:26:32 +02004355 if (single_major)
4356 disk->flags |= GENHD_FL_EXT_DEVT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004357 disk->fops = &rbd_bd_ops;
4358 disk->private_data = rbd_dev;
4359
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004360 memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
4361 rbd_dev->tag_set.ops = &rbd_mq_ops;
Ilya Dryomovb5584182015-06-23 16:21:19 +03004362 rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004363 rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
Ilya Dryomovb5584182015-06-23 16:21:19 +03004364 rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004365 rbd_dev->tag_set.nr_hw_queues = 1;
4366 rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
4367
4368 err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
4369 if (err)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004370 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07004371
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004372 q = blk_mq_init_queue(&rbd_dev->tag_set);
4373 if (IS_ERR(q)) {
4374 err = PTR_ERR(q);
4375 goto out_tag_set;
4376 }
4377
Ilya Dryomovd8a2c892015-03-24 16:15:17 +03004378 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
4379 /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
Alex Elder593a9e72012-02-07 12:03:37 -06004380
Josh Durgin029bcbd2011-07-22 11:35:23 -07004381 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06004382 segment_size = rbd_obj_bytes(&rbd_dev->header);
4383 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
Ilya Dryomov0d9fde42015-10-07 16:09:35 +02004384 q->limits.max_sectors = queue_max_hw_sectors(q);
Ilya Dryomovd3834fe2015-06-12 19:19:02 +03004385 blk_queue_max_segments(q, segment_size / SECTOR_SIZE);
Alex Elder593a9e72012-02-07 12:03:37 -06004386 blk_queue_max_segment_size(q, segment_size);
4387 blk_queue_io_min(q, segment_size);
4388 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07004389
Guangliang Zhao90e98c52014-04-01 22:22:16 +08004390 /* enable the discard support */
4391 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
4392 q->limits.discard_granularity = segment_size;
4393 q->limits.discard_alignment = segment_size;
Jens Axboe2bb4cd52015-07-14 08:15:12 -06004394 blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE);
Ilya Dryomov6ac56952017-05-22 19:59:24 +02004395 blk_queue_max_write_zeroes_sectors(q, segment_size / SECTOR_SIZE);
Guangliang Zhao90e98c52014-04-01 22:22:16 +08004396
Ronny Hegewaldbae818e2015-10-15 18:50:46 +00004397 if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
Jan Karadc3b17c2017-02-02 15:56:50 +01004398 q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
Ronny Hegewaldbae818e2015-10-15 18:50:46 +00004399
Ilya Dryomov5769ed02017-04-13 12:17:38 +02004400 /*
4401 * disk_release() expects a queue ref from add_disk() and will
4402 * put it. Hold an extra ref until add_disk() is called.
4403 */
4404 WARN_ON(!blk_get_queue(q));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004405 disk->queue = q;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004406 q->queuedata = rbd_dev;
4407
4408 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004409
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004410 return 0;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004411out_tag_set:
4412 blk_mq_free_tag_set(&rbd_dev->tag_set);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004413out_disk:
4414 put_disk(disk);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004415 return err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004416}
4417
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004418/*
4419 sysfs
4420*/
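/*
 * The attributes below appear per mapping under
 * /sys/bus/rbd/devices/<dev-id>/.  For example, with device id 0:
 *
 *	$ cat /sys/bus/rbd/devices/0/size
 *	$ echo 1 > /sys/bus/rbd/devices/0/refresh	# calls rbd_dev_refresh()
 */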
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004421
Alex Elder593a9e72012-02-07 12:03:37 -06004422static struct rbd_device *dev_to_rbd_dev(struct device *dev)
4423{
4424 return container_of(dev, struct rbd_device, dev);
4425}
4426
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004427static ssize_t rbd_size_show(struct device *dev,
4428 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004429{
Alex Elder593a9e72012-02-07 12:03:37 -06004430 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004431
Alex Elderfc71d832013-04-26 15:44:36 -05004432 return sprintf(buf, "%llu\n",
4433 (unsigned long long)rbd_dev->mapping.size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004434}
4435
Alex Elder34b13182012-07-13 20:35:12 -05004436/*
4437 * Note this shows the features for whatever's mapped, which is not
4438 * necessarily the base image.
4439 */
4440static ssize_t rbd_features_show(struct device *dev,
4441 struct device_attribute *attr, char *buf)
4442{
4443 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4444
4445 return sprintf(buf, "0x%016llx\n",
Alex Elderfc71d832013-04-26 15:44:36 -05004446 (unsigned long long)rbd_dev->mapping.features);
Alex Elder34b13182012-07-13 20:35:12 -05004447}
4448
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004449static ssize_t rbd_major_show(struct device *dev,
4450 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004451{
Alex Elder593a9e72012-02-07 12:03:37 -06004452 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004453
Alex Elderfc71d832013-04-26 15:44:36 -05004454 if (rbd_dev->major)
4455 return sprintf(buf, "%d\n", rbd_dev->major);
4456
4457 return sprintf(buf, "(none)\n");
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004458}
Alex Elderfc71d832013-04-26 15:44:36 -05004459
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004460static ssize_t rbd_minor_show(struct device *dev,
4461 struct device_attribute *attr, char *buf)
4462{
4463 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4464
4465 return sprintf(buf, "%d\n", rbd_dev->minor);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004466}
4467
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02004468static ssize_t rbd_client_addr_show(struct device *dev,
4469 struct device_attribute *attr, char *buf)
4470{
4471 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4472 struct ceph_entity_addr *client_addr =
4473 ceph_client_addr(rbd_dev->rbd_client->client);
4474
4475 return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
4476 le32_to_cpu(client_addr->nonce));
4477}
4478
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004479static ssize_t rbd_client_id_show(struct device *dev,
4480 struct device_attribute *attr, char *buf)
4481{
Alex Elder593a9e72012-02-07 12:03:37 -06004482 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004483
Alex Elder1dbb4392012-01-24 10:08:37 -06004484 return sprintf(buf, "client%lld\n",
Ilya Dryomov033268a2016-08-12 14:59:58 +02004485 ceph_client_gid(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004486}
4487
Mike Christie267fb902016-08-18 18:38:43 +02004488static ssize_t rbd_cluster_fsid_show(struct device *dev,
4489 struct device_attribute *attr, char *buf)
4490{
4491 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4492
4493 return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
4494}
4495
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004496static ssize_t rbd_config_info_show(struct device *dev,
4497 struct device_attribute *attr, char *buf)
4498{
4499 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4500
4501 return sprintf(buf, "%s\n", rbd_dev->config_info);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004502}
4503
4504static ssize_t rbd_pool_show(struct device *dev,
4505 struct device_attribute *attr, char *buf)
4506{
Alex Elder593a9e72012-02-07 12:03:37 -06004507 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004508
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004509 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004510}
4511
Alex Elder9bb2f332012-07-12 10:46:35 -05004512static ssize_t rbd_pool_id_show(struct device *dev,
4513 struct device_attribute *attr, char *buf)
4514{
4515 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4516
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004517 return sprintf(buf, "%llu\n",
Alex Elderfc71d832013-04-26 15:44:36 -05004518 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05004519}
4520
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004521static ssize_t rbd_name_show(struct device *dev,
4522 struct device_attribute *attr, char *buf)
4523{
Alex Elder593a9e72012-02-07 12:03:37 -06004524 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004525
Alex Eldera92ffdf2012-10-30 19:40:33 -05004526 if (rbd_dev->spec->image_name)
4527 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
4528
4529 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004530}
4531
Alex Elder589d30e2012-07-10 20:30:11 -05004532static ssize_t rbd_image_id_show(struct device *dev,
4533 struct device_attribute *attr, char *buf)
4534{
4535 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4536
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004537 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05004538}
4539
Alex Elder34b13182012-07-13 20:35:12 -05004540/*
4541 * Shows the name of the currently-mapped snapshot (or
4542 * RBD_SNAP_HEAD_NAME for the base image).
4543 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004544static ssize_t rbd_snap_show(struct device *dev,
4545 struct device_attribute *attr,
4546 char *buf)
4547{
Alex Elder593a9e72012-02-07 12:03:37 -06004548 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004549
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004550 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004551}
4552
Mike Christie92a58672016-08-18 18:38:44 +02004553static ssize_t rbd_snap_id_show(struct device *dev,
4554 struct device_attribute *attr, char *buf)
4555{
4556 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4557
4558 return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
4559}
4560
Alex Elder86b00e02012-10-25 23:34:42 -05004561/*
Ilya Dryomovff961282014-07-22 21:53:07 +04004562 * For a v2 image, shows the chain of parent images, separated by empty
4563 * lines. For v1 images or if there is no parent, shows "(no parent
4564 * image)".
Alex Elder86b00e02012-10-25 23:34:42 -05004565 */
4566static ssize_t rbd_parent_show(struct device *dev,
Ilya Dryomovff961282014-07-22 21:53:07 +04004567 struct device_attribute *attr,
4568 char *buf)
Alex Elder86b00e02012-10-25 23:34:42 -05004569{
4570 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Ilya Dryomovff961282014-07-22 21:53:07 +04004571 ssize_t count = 0;
Alex Elder86b00e02012-10-25 23:34:42 -05004572
Ilya Dryomovff961282014-07-22 21:53:07 +04004573 if (!rbd_dev->parent)
Alex Elder86b00e02012-10-25 23:34:42 -05004574 return sprintf(buf, "(no parent image)\n");
4575
Ilya Dryomovff961282014-07-22 21:53:07 +04004576 for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
4577 struct rbd_spec *spec = rbd_dev->parent_spec;
Alex Elder86b00e02012-10-25 23:34:42 -05004578
Ilya Dryomovff961282014-07-22 21:53:07 +04004579 count += sprintf(&buf[count], "%s"
4580 "pool_id %llu\npool_name %s\n"
4581 "image_id %s\nimage_name %s\n"
4582 "snap_id %llu\nsnap_name %s\n"
4583 "overlap %llu\n",
4584 !count ? "" : "\n", /* first? */
4585 spec->pool_id, spec->pool_name,
4586 spec->image_id, spec->image_name ?: "(unknown)",
4587 spec->snap_id, spec->snap_name,
4588 rbd_dev->parent_overlap);
4589 }
Alex Elder86b00e02012-10-25 23:34:42 -05004590
Ilya Dryomovff961282014-07-22 21:53:07 +04004591 return count;
Alex Elder86b00e02012-10-25 23:34:42 -05004592}
4593
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004594static ssize_t rbd_image_refresh(struct device *dev,
4595 struct device_attribute *attr,
4596 const char *buf,
4597 size_t size)
4598{
Alex Elder593a9e72012-02-07 12:03:37 -06004599 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05004600 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004601
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004602 ret = rbd_dev_refresh(rbd_dev);
Alex Eldere627db02013-05-06 07:40:30 -05004603 if (ret)
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04004604 return ret;
Alex Elderb8136232012-07-25 09:32:41 -05004605
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04004606 return size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004607}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004608
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004609static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
Alex Elder34b13182012-07-13 20:35:12 -05004610static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004611static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004612static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02004613static DEVICE_ATTR(client_addr, S_IRUGO, rbd_client_addr_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004614static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
Mike Christie267fb902016-08-18 18:38:43 +02004615static DEVICE_ATTR(cluster_fsid, S_IRUGO, rbd_cluster_fsid_show, NULL);
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004616static DEVICE_ATTR(config_info, S_IRUSR, rbd_config_info_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004617static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
Alex Elder9bb2f332012-07-12 10:46:35 -05004618static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004619static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
Alex Elder589d30e2012-07-10 20:30:11 -05004620static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004621static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
4622static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
Mike Christie92a58672016-08-18 18:38:44 +02004623static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
Alex Elder86b00e02012-10-25 23:34:42 -05004624static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004625
4626static struct attribute *rbd_attrs[] = {
4627 &dev_attr_size.attr,
Alex Elder34b13182012-07-13 20:35:12 -05004628 &dev_attr_features.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004629 &dev_attr_major.attr,
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004630 &dev_attr_minor.attr,
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02004631 &dev_attr_client_addr.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004632 &dev_attr_client_id.attr,
Mike Christie267fb902016-08-18 18:38:43 +02004633 &dev_attr_cluster_fsid.attr,
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004634 &dev_attr_config_info.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004635 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05004636 &dev_attr_pool_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004637 &dev_attr_name.attr,
Alex Elder589d30e2012-07-10 20:30:11 -05004638 &dev_attr_image_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004639 &dev_attr_current_snap.attr,
Mike Christie92a58672016-08-18 18:38:44 +02004640 &dev_attr_snap_id.attr,
Alex Elder86b00e02012-10-25 23:34:42 -05004641 &dev_attr_parent.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004642 &dev_attr_refresh.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004643 NULL
4644};
4645
4646static struct attribute_group rbd_attr_group = {
4647 .attrs = rbd_attrs,
4648};
4649
4650static const struct attribute_group *rbd_attr_groups[] = {
4651 &rbd_attr_group,
4652 NULL
4653};
4654
Ilya Dryomov6cac4692015-10-16 20:11:25 +02004655static void rbd_dev_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004656
Bhumika Goyalb9942bc2017-02-11 12:14:38 +05304657static const struct device_type rbd_device_type = {
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004658 .name = "rbd",
4659 .groups = rbd_attr_groups,
Ilya Dryomov6cac4692015-10-16 20:11:25 +02004660 .release = rbd_dev_release,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004661};
4662
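/*
 * An rbd_spec identifies an image or snapshot by pool, image and snap
 * name/id.  It is reference counted so the same spec can be shared
 * instead of duplicated (see rbd_spec_get()/rbd_spec_put()).
 */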
Alex Elder8b8fb992012-10-26 17:25:24 -05004663static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
4664{
4665 kref_get(&spec->kref);
4666
4667 return spec;
4668}
4669
4670static void rbd_spec_free(struct kref *kref);
4671static void rbd_spec_put(struct rbd_spec *spec)
4672{
4673 if (spec)
4674 kref_put(&spec->kref, rbd_spec_free);
4675}
4676
4677static struct rbd_spec *rbd_spec_alloc(void)
4678{
4679 struct rbd_spec *spec;
4680
4681 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
4682 if (!spec)
4683 return NULL;
Ilya Dryomov04077592014-07-23 17:11:20 +04004684
4685 spec->pool_id = CEPH_NOPOOL;
4686 spec->snap_id = CEPH_NOSNAP;
Alex Elder8b8fb992012-10-26 17:25:24 -05004687 kref_init(&spec->kref);
4688
Alex Elder8b8fb992012-10-26 17:25:24 -05004689 return spec;
4690}
4691
4692static void rbd_spec_free(struct kref *kref)
4693{
4694 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
4695
4696 kfree(spec->pool_name);
4697 kfree(spec->image_id);
4698 kfree(spec->image_name);
4699 kfree(spec->snap_name);
4700 kfree(spec);
4701}
4702
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004703static void rbd_dev_free(struct rbd_device *rbd_dev)
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004704{
Ilya Dryomov99d16942016-08-12 16:11:41 +02004705 WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004706 WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004707
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004708 ceph_oid_destroy(&rbd_dev->header_oid);
Ilya Dryomov6b6dddb2016-08-05 16:15:38 +02004709 ceph_oloc_destroy(&rbd_dev->header_oloc);
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004710 kfree(rbd_dev->config_info);
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004711
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004712 rbd_put_client(rbd_dev->rbd_client);
4713 rbd_spec_put(rbd_dev->spec);
4714 kfree(rbd_dev->opts);
4715 kfree(rbd_dev);
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004716}
4717
4718static void rbd_dev_release(struct device *dev)
4719{
4720 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4721 bool need_put = !!rbd_dev->opts;
4722
4723 if (need_put) {
4724 destroy_workqueue(rbd_dev->task_wq);
4725 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4726 }
4727
4728 rbd_dev_free(rbd_dev);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004729
4730 /*
4731 * This is racy, but way better than putting module outside of
4732 * the release callback. The race window is pretty small, so
4733 * doing something similar to dm (dm-builtin.c) is overkill.
4734 */
4735 if (need_put)
4736 module_put(THIS_MODULE);
4737}
4738
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004739static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
4740 struct rbd_spec *spec)
Alex Elderc53d5892012-10-25 23:34:42 -05004741{
4742 struct rbd_device *rbd_dev;
4743
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004744 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
Alex Elderc53d5892012-10-25 23:34:42 -05004745 if (!rbd_dev)
4746 return NULL;
4747
4748 spin_lock_init(&rbd_dev->lock);
4749 INIT_LIST_HEAD(&rbd_dev->node);
Alex Elderc53d5892012-10-25 23:34:42 -05004750 init_rwsem(&rbd_dev->header_rwsem);
4751
Ilya Dryomov7e973322017-01-25 18:16:22 +01004752 rbd_dev->header.data_pool_id = CEPH_NOPOOL;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004753 ceph_oid_init(&rbd_dev->header_oid);
Ilya Dryomov431a02c2017-01-25 18:16:21 +01004754 rbd_dev->header_oloc.pool = spec->pool_id;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004755
Ilya Dryomov99d16942016-08-12 16:11:41 +02004756 mutex_init(&rbd_dev->watch_mutex);
4757 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
4758 INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
4759
Ilya Dryomoved95b212016-08-12 16:40:02 +02004760 init_rwsem(&rbd_dev->lock_rwsem);
4761 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
4762 INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
4763 INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
4764 INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
4765 INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
4766 init_waitqueue_head(&rbd_dev->lock_waitq);
4767
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004768 rbd_dev->dev.bus = &rbd_bus_type;
4769 rbd_dev->dev.type = &rbd_device_type;
4770 rbd_dev->dev.parent = &rbd_root_dev;
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004771 device_initialize(&rbd_dev->dev);
4772
Alex Elderc53d5892012-10-25 23:34:42 -05004773 rbd_dev->rbd_client = rbdc;
Ilya Dryomovd1475432015-06-22 13:24:48 +03004774 rbd_dev->spec = spec;
Alex Elder0903e872012-11-14 12:25:19 -06004775
Alex Elderc53d5892012-10-25 23:34:42 -05004776 return rbd_dev;
4777}
4778
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004779/*
4780 * Create a mapping rbd_dev, i.e. one that will back a /dev/rbd* block device.
4781 */
4782static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
4783 struct rbd_spec *spec,
4784 struct rbd_options *opts)
4785{
4786 struct rbd_device *rbd_dev;
4787
4788 rbd_dev = __rbd_dev_create(rbdc, spec);
4789 if (!rbd_dev)
4790 return NULL;
4791
4792 rbd_dev->opts = opts;
4793
4794 /* get an id and fill in device name */
4795 rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
4796 minor_to_rbd_dev_id(1 << MINORBITS),
4797 GFP_KERNEL);
4798 if (rbd_dev->dev_id < 0)
4799 goto fail_rbd_dev;
4800
4801 sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
4802 rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
4803 rbd_dev->name);
4804 if (!rbd_dev->task_wq)
4805 goto fail_dev_id;
4806
4807 /* we have a ref from do_rbd_add() */
4808 __module_get(THIS_MODULE);
4809
4810 dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
4811 return rbd_dev;
4812
4813fail_dev_id:
4814 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4815fail_rbd_dev:
4816 rbd_dev_free(rbd_dev);
4817 return NULL;
4818}
4819
Alex Elderc53d5892012-10-25 23:34:42 -05004820static void rbd_dev_destroy(struct rbd_device *rbd_dev)
4821{
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004822 if (rbd_dev)
4823 put_device(&rbd_dev->dev);
Alex Elderc53d5892012-10-25 23:34:42 -05004824}
4825
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004826/*
Alex Elder9d475de2012-07-03 16:01:19 -05004827 * Get the size and object order for an image snapshot, or if
4828 * snap_id is CEPH_NOSNAP, get this information for the base
4829 * image.
4830 */
4831static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
4832 u8 *order, u64 *snap_size)
4833{
4834 __le64 snapid = cpu_to_le64(snap_id);
4835 int ret;
4836 struct {
4837 u8 order;
4838 __le64 size;
4839 } __attribute__ ((packed)) size_buf = { 0 };
4840
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004841 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4842 &rbd_dev->header_oloc, "get_size",
4843 &snapid, sizeof(snapid),
4844 &size_buf, sizeof(size_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06004845 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder9d475de2012-07-03 16:01:19 -05004846 if (ret < 0)
4847 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05004848 if (ret < sizeof (size_buf))
4849 return -ERANGE;
Alex Elder9d475de2012-07-03 16:01:19 -05004850
Josh Durginc3545572013-08-28 17:08:10 -07004851 if (order) {
Alex Elderc86f86e2013-04-25 15:09:41 -05004852 *order = size_buf.order;
Josh Durginc3545572013-08-28 17:08:10 -07004853 dout(" order %u", (unsigned int)*order);
4854 }
Alex Elder9d475de2012-07-03 16:01:19 -05004855 *snap_size = le64_to_cpu(size_buf.size);
4856
Josh Durginc3545572013-08-28 17:08:10 -07004857 dout(" snap_id 0x%016llx snap_size = %llu\n",
4858 (unsigned long long)snap_id,
Alex Elder57385b52013-04-21 12:14:45 -05004859 (unsigned long long)*snap_size);
Alex Elder9d475de2012-07-03 16:01:19 -05004860
4861 return 0;
4862}
4863
4864static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
4865{
4866 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
4867 &rbd_dev->header.obj_order,
4868 &rbd_dev->header.image_size);
4869}
4870
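/*
 * Fetch the prefix used to name this image's data objects (for format
 * 2 images this is normally "rbd_data.<image id>") via the
 * "get_object_prefix" class method.
 */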
Alex Elder1e130192012-07-03 16:01:19 -05004871static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
4872{
4873 void *reply_buf;
4874 int ret;
4875 void *p;
4876
4877 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
4878 if (!reply_buf)
4879 return -ENOMEM;
4880
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004881 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4882 &rbd_dev->header_oloc, "get_object_prefix",
4883 NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
Alex Elder36be9a72013-01-19 00:30:28 -06004884 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder1e130192012-07-03 16:01:19 -05004885 if (ret < 0)
4886 goto out;
4887
4888 p = reply_buf;
4889 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
Alex Elder57385b52013-04-21 12:14:45 -05004890 p + ret, NULL, GFP_NOIO);
4891 ret = 0;
Alex Elder1e130192012-07-03 16:01:19 -05004892
4893 if (IS_ERR(rbd_dev->header.object_prefix)) {
4894 ret = PTR_ERR(rbd_dev->header.object_prefix);
4895 rbd_dev->header.object_prefix = NULL;
4896 } else {
4897 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
4898 }
Alex Elder1e130192012-07-03 16:01:19 -05004899out:
4900 kfree(reply_buf);
4901
4902 return ret;
4903}
4904
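/*
 * Fetch the feature bits for the given snapshot (or for the base image
 * if snap_id is CEPH_NOSNAP) via the "get_features" class method.
 * Fails with -ENXIO if the image has incompatible features beyond
 * RBD_FEATURES_SUPPORTED, since we cannot safely handle such images.
 */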
Alex Elderb1b54022012-07-03 16:01:19 -05004905static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
4906 u64 *snap_features)
4907{
4908 __le64 snapid = cpu_to_le64(snap_id);
4909 struct {
4910 __le64 features;
4911 __le64 incompat;
Alex Elder41579762013-04-21 12:14:45 -05004912 } __attribute__ ((packed)) features_buf = { 0 };
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004913 u64 unsup;
Alex Elderb1b54022012-07-03 16:01:19 -05004914 int ret;
4915
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004916 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4917 &rbd_dev->header_oloc, "get_features",
4918 &snapid, sizeof(snapid),
4919 &features_buf, sizeof(features_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06004920 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderb1b54022012-07-03 16:01:19 -05004921 if (ret < 0)
4922 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05004923 if (ret < sizeof (features_buf))
4924 return -ERANGE;
Alex Elderd8891402012-10-09 13:50:17 -07004925
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004926 unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
4927 if (unsup) {
4928 rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
4929 unsup);
Alex Elderb8f5c6e2012-11-01 08:39:26 -05004930 return -ENXIO;
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004931 }
Alex Elderd8891402012-10-09 13:50:17 -07004932
Alex Elderb1b54022012-07-03 16:01:19 -05004933 *snap_features = le64_to_cpu(features_buf.features);
4934
4935 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
Alex Elder57385b52013-04-21 12:14:45 -05004936 (unsigned long long)snap_id,
4937 (unsigned long long)*snap_features,
4938 (unsigned long long)le64_to_cpu(features_buf.incompat));
Alex Elderb1b54022012-07-03 16:01:19 -05004939
4940 return 0;
4941}
4942
4943static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
4944{
4945 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
4946 &rbd_dev->header.features);
4947}
4948
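/*
 * Query the "get_parent" class method for this image's parent spec
 * (pool id, image id, snap id) and overlap.  A pool id of CEPH_NOPOOL
 * means there is no parent -- either there never was one, or the image
 * has since been flattened, in which case the overlap is zeroed so new
 * requests stop being directed at the former parent.
 */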
Alex Elder86b00e02012-10-25 23:34:42 -05004949static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
4950{
4951 struct rbd_spec *parent_spec;
4952 size_t size;
4953 void *reply_buf = NULL;
4954 __le64 snapid;
4955 void *p;
4956 void *end;
Alex Elder642a2532013-05-06 17:40:33 -05004957 u64 pool_id;
Alex Elder86b00e02012-10-25 23:34:42 -05004958 char *image_id;
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004959 u64 snap_id;
Alex Elder86b00e02012-10-25 23:34:42 -05004960 u64 overlap;
Alex Elder86b00e02012-10-25 23:34:42 -05004961 int ret;
4962
4963 parent_spec = rbd_spec_alloc();
4964 if (!parent_spec)
4965 return -ENOMEM;
4966
4967 size = sizeof (__le64) + /* pool_id */
4968 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
4969 sizeof (__le64) + /* snap_id */
4970 sizeof (__le64); /* overlap */
4971 reply_buf = kmalloc(size, GFP_KERNEL);
4972 if (!reply_buf) {
4973 ret = -ENOMEM;
4974 goto out_err;
4975 }
4976
Ilya Dryomov4d9b67c2014-07-24 10:42:13 +04004977 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004978 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4979 &rbd_dev->header_oloc, "get_parent",
4980 &snapid, sizeof(snapid), reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06004981 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder86b00e02012-10-25 23:34:42 -05004982 if (ret < 0)
4983 goto out_err;
4984
Alex Elder86b00e02012-10-25 23:34:42 -05004985 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05004986 end = reply_buf + ret;
4987 ret = -ERANGE;
Alex Elder642a2532013-05-06 17:40:33 -05004988 ceph_decode_64_safe(&p, end, pool_id, out_err);
Alex Elder392a9da2013-05-06 17:40:33 -05004989 if (pool_id == CEPH_NOPOOL) {
4990 /*
4991		 * Either the parent never existed, or we have a
4992		 * record of it but the image got flattened so it no
4993 * longer has a parent. When the parent of a
4994 * layered image disappears we immediately set the
4995 * overlap to 0. The effect of this is that all new
4996 * requests will be treated as if the image had no
4997 * parent.
4998 */
4999 if (rbd_dev->parent_overlap) {
5000 rbd_dev->parent_overlap = 0;
Alex Elder392a9da2013-05-06 17:40:33 -05005001 rbd_dev_parent_put(rbd_dev);
5002 pr_info("%s: clone image has been flattened\n",
5003 rbd_dev->disk->disk_name);
5004 }
5005
Alex Elder86b00e02012-10-25 23:34:42 -05005006 goto out; /* No parent? No problem. */
Alex Elder392a9da2013-05-06 17:40:33 -05005007 }
Alex Elder86b00e02012-10-25 23:34:42 -05005008
Alex Elder0903e872012-11-14 12:25:19 -06005009 /* The ceph file layout needs to fit pool id in 32 bits */
5010
5011 ret = -EIO;
Alex Elder642a2532013-05-06 17:40:33 -05005012 if (pool_id > (u64)U32_MAX) {
Ilya Dryomov9584d502014-07-11 12:11:20 +04005013 rbd_warn(NULL, "parent pool id too large (%llu > %u)",
Alex Elder642a2532013-05-06 17:40:33 -05005014 (unsigned long long)pool_id, U32_MAX);
Alex Elder57385b52013-04-21 12:14:45 -05005015 goto out_err;
Alex Elderc0cd10db2013-04-26 09:43:47 -05005016 }
Alex Elder0903e872012-11-14 12:25:19 -06005017
Alex Elder979ed482012-11-01 08:39:26 -05005018 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elder86b00e02012-10-25 23:34:42 -05005019 if (IS_ERR(image_id)) {
5020 ret = PTR_ERR(image_id);
5021 goto out_err;
5022 }
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005023 ceph_decode_64_safe(&p, end, snap_id, out_err);
Alex Elder86b00e02012-10-25 23:34:42 -05005024 ceph_decode_64_safe(&p, end, overlap, out_err);
5025
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005026 /*
5027 * The parent won't change (except when the clone is
5028	 * flattened, a case already handled above).  So we only need
5029	 * to record the parent spec if we have not already done so.
5030 */
5031 if (!rbd_dev->parent_spec) {
5032 parent_spec->pool_id = pool_id;
5033 parent_spec->image_id = image_id;
5034 parent_spec->snap_id = snap_id;
Alex Elder70cf49c2013-05-06 17:40:33 -05005035 rbd_dev->parent_spec = parent_spec;
5036 parent_spec = NULL; /* rbd_dev now owns this */
Ilya Dryomovfbba11b2014-06-27 21:46:33 +04005037 } else {
5038 kfree(image_id);
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005039 }
5040
5041 /*
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03005042 * We always update the parent overlap. If it's zero we issue
5043 * a warning, as we will proceed as if there was no parent.
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005044 */
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005045 if (!overlap) {
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005046 if (parent_spec) {
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03005047 /* refresh, careful to warn just once */
5048 if (rbd_dev->parent_overlap)
5049 rbd_warn(rbd_dev,
5050 "clone now standalone (overlap became 0)");
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005051 } else {
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03005052 /* initial probe */
5053 rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005054 }
Alex Elder70cf49c2013-05-06 17:40:33 -05005055 }
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03005056 rbd_dev->parent_overlap = overlap;
5057
Alex Elder86b00e02012-10-25 23:34:42 -05005058out:
5059 ret = 0;
5060out_err:
5061 kfree(reply_buf);
5062 rbd_spec_put(parent_spec);
5063
5064 return ret;
5065}
5066
Alex Eldercc070d52013-04-21 12:14:45 -05005067static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
5068{
5069 struct {
5070 __le64 stripe_unit;
5071 __le64 stripe_count;
5072 } __attribute__ ((packed)) striping_info_buf = { 0 };
5073 size_t size = sizeof (striping_info_buf);
5074 void *p;
5075 u64 obj_size;
5076 u64 stripe_unit;
5077 u64 stripe_count;
5078 int ret;
5079
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005080 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5081 &rbd_dev->header_oloc, "get_stripe_unit_count",
5082 NULL, 0, &striping_info_buf, size);
Alex Eldercc070d52013-04-21 12:14:45 -05005083 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5084 if (ret < 0)
5085 return ret;
5086 if (ret < size)
5087 return -ERANGE;
5088
5089 /*
5090 * We don't actually support the "fancy striping" feature
5091 * (STRIPINGV2) yet, but if the striping sizes are the
5092 * defaults the behavior is the same as before. So find
5093 * out, and only fail if the image has non-default values.
5094 */
5095 ret = -EINVAL;
Ilya Dryomov5bc3fb12017-01-25 18:16:22 +01005096 obj_size = rbd_obj_bytes(&rbd_dev->header);
Alex Eldercc070d52013-04-21 12:14:45 -05005097 p = &striping_info_buf;
5098 stripe_unit = ceph_decode_64(&p);
5099 if (stripe_unit != obj_size) {
5100 rbd_warn(rbd_dev, "unsupported stripe unit "
5101 "(got %llu want %llu)",
5102 stripe_unit, obj_size);
5103 return -EINVAL;
5104 }
5105 stripe_count = ceph_decode_64(&p);
5106 if (stripe_count != 1) {
5107 rbd_warn(rbd_dev, "unsupported stripe count "
5108 "(got %llu want 1)", stripe_count);
5109 return -EINVAL;
5110 }
Alex Elder500d0c02013-04-26 09:43:47 -05005111 rbd_dev->header.stripe_unit = stripe_unit;
5112 rbd_dev->header.stripe_count = stripe_count;
Alex Eldercc070d52013-04-21 12:14:45 -05005113
5114 return 0;
5115}
5116
Ilya Dryomov7e973322017-01-25 18:16:22 +01005117static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
5118{
5119 __le64 data_pool_id;
5120 int ret;
5121
5122 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5123 &rbd_dev->header_oloc, "get_data_pool",
5124 NULL, 0, &data_pool_id, sizeof(data_pool_id));
5125 if (ret < 0)
5126 return ret;
5127 if (ret < sizeof(data_pool_id))
5128 return -EBADMSG;
5129
5130 rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
5131 WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
5132 return 0;
5133}
5134
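/*
 * Ask the rbd directory object (RBD_DIRECTORY) to translate our image
 * id back into an image name via the "dir_get_name" class method.
 * Returns the name, or NULL on any failure; the name is purely
 * informational, so a NULL result is tolerated (sysfs then shows
 * "(unknown)").
 */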
Alex Elder9e15b772012-10-30 19:40:33 -05005135static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
5136{
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005137 CEPH_DEFINE_OID_ONSTACK(oid);
Alex Elder9e15b772012-10-30 19:40:33 -05005138 size_t image_id_size;
5139 char *image_id;
5140 void *p;
5141 void *end;
5142 size_t size;
5143 void *reply_buf = NULL;
5144 size_t len = 0;
5145 char *image_name = NULL;
5146 int ret;
5147
5148 rbd_assert(!rbd_dev->spec->image_name);
5149
Alex Elder69e7a022012-11-01 08:39:26 -05005150 len = strlen(rbd_dev->spec->image_id);
5151 image_id_size = sizeof (__le32) + len;
Alex Elder9e15b772012-10-30 19:40:33 -05005152 image_id = kmalloc(image_id_size, GFP_KERNEL);
5153 if (!image_id)
5154 return NULL;
5155
5156 p = image_id;
Alex Elder41579762013-04-21 12:14:45 -05005157 end = image_id + image_id_size;
Alex Elder57385b52013-04-21 12:14:45 -05005158 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
Alex Elder9e15b772012-10-30 19:40:33 -05005159
5160 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
5161 reply_buf = kmalloc(size, GFP_KERNEL);
5162 if (!reply_buf)
5163 goto out;
5164
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005165 ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
5166 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5167 "dir_get_name", image_id, image_id_size,
5168 reply_buf, size);
Alex Elder9e15b772012-10-30 19:40:33 -05005169 if (ret < 0)
5170 goto out;
5171 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05005172 end = reply_buf + ret;
5173
Alex Elder9e15b772012-10-30 19:40:33 -05005174 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
5175 if (IS_ERR(image_name))
5176 image_name = NULL;
5177 else
5178 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
5179out:
5180 kfree(reply_buf);
5181 kfree(image_id);
5182
5183 return image_name;
5184}
5185
Alex Elder2ad3d712013-04-30 00:44:33 -05005186static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5187{
5188 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5189 const char *snap_name;
5190 u32 which = 0;
5191
5192 /* Skip over names until we find the one we are looking for */
5193
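	/*
	 * Format 1 snapshot names are stored back-to-back as
	 * NUL-terminated strings, parallel to snapc->snaps[]
	 * (e.g. "one\0two\0three\0"), which is why we advance
	 * by strlen() + 1 below.
	 */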
5194 snap_name = rbd_dev->header.snap_names;
5195 while (which < snapc->num_snaps) {
5196 if (!strcmp(name, snap_name))
5197 return snapc->snaps[which];
5198 snap_name += strlen(snap_name) + 1;
5199 which++;
5200 }
5201 return CEPH_NOSNAP;
5202}
5203
5204static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5205{
5206 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5207 u32 which;
5208 bool found = false;
5209 u64 snap_id;
5210
5211 for (which = 0; !found && which < snapc->num_snaps; which++) {
5212 const char *snap_name;
5213
5214 snap_id = snapc->snaps[which];
5215 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
Josh Durginefadc982013-08-29 19:16:42 -07005216 if (IS_ERR(snap_name)) {
 5217			/* ignore snapshots that no longer exist */
5218 if (PTR_ERR(snap_name) == -ENOENT)
5219 continue;
5220 else
5221 break;
5222 }
Alex Elder2ad3d712013-04-30 00:44:33 -05005223 found = !strcmp(name, snap_name);
5224 kfree(snap_name);
5225 }
5226 return found ? snap_id : CEPH_NOSNAP;
5227}
5228
5229/*
5230 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
5231 * no snapshot by that name is found, or if an error occurs.
5232 */
5233static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5234{
5235 if (rbd_dev->image_format == 1)
5236 return rbd_v1_snap_id_by_name(rbd_dev, name);
5237
5238 return rbd_v2_snap_id_by_name(rbd_dev, name);
5239}
5240
Alex Elder9e15b772012-10-30 19:40:33 -05005241/*
Ilya Dryomov04077592014-07-23 17:11:20 +04005242 * An image being mapped will have everything but the snap id.
Alex Elder9e15b772012-10-30 19:40:33 -05005243 */
Ilya Dryomov04077592014-07-23 17:11:20 +04005244static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
5245{
5246 struct rbd_spec *spec = rbd_dev->spec;
5247
5248 rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
5249 rbd_assert(spec->image_id && spec->image_name);
5250 rbd_assert(spec->snap_name);
5251
5252 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
5253 u64 snap_id;
5254
5255 snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
5256 if (snap_id == CEPH_NOSNAP)
5257 return -ENOENT;
5258
5259 spec->snap_id = snap_id;
5260 } else {
5261 spec->snap_id = CEPH_NOSNAP;
5262 }
5263
5264 return 0;
5265}
5266
5267/*
5268 * A parent image will have all ids but none of the names.
5269 *
5270 * All names in an rbd spec are dynamically allocated. It's OK if we
5271 * can't figure out the name for an image id.
5272 */
5273static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
Alex Elder9e15b772012-10-30 19:40:33 -05005274{
Alex Elder2e9f7f12013-04-26 09:43:48 -05005275 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5276 struct rbd_spec *spec = rbd_dev->spec;
5277 const char *pool_name;
5278 const char *image_name;
5279 const char *snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05005280 int ret;
5281
Ilya Dryomov04077592014-07-23 17:11:20 +04005282 rbd_assert(spec->pool_id != CEPH_NOPOOL);
5283 rbd_assert(spec->image_id);
5284 rbd_assert(spec->snap_id != CEPH_NOSNAP);
Alex Elder9e15b772012-10-30 19:40:33 -05005285
Alex Elder2e9f7f12013-04-26 09:43:48 -05005286 /* Get the pool name; we have to make our own copy of this */
Alex Elder9e15b772012-10-30 19:40:33 -05005287
Alex Elder2e9f7f12013-04-26 09:43:48 -05005288 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
5289 if (!pool_name) {
5290 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05005291 return -EIO;
5292 }
Alex Elder2e9f7f12013-04-26 09:43:48 -05005293 pool_name = kstrdup(pool_name, GFP_KERNEL);
5294 if (!pool_name)
Alex Elder9e15b772012-10-30 19:40:33 -05005295 return -ENOMEM;
5296
5297 /* Fetch the image name; tolerate failure here */
5298
Alex Elder2e9f7f12013-04-26 09:43:48 -05005299 image_name = rbd_dev_image_name(rbd_dev);
5300 if (!image_name)
Alex Elder06ecc6c2012-11-01 10:17:15 -05005301 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05005302
Ilya Dryomov04077592014-07-23 17:11:20 +04005303 /* Fetch the snapshot name */
Alex Elder9e15b772012-10-30 19:40:33 -05005304
Alex Elder2e9f7f12013-04-26 09:43:48 -05005305 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
Josh Durginda6a6b62013-09-04 17:57:31 -07005306 if (IS_ERR(snap_name)) {
5307 ret = PTR_ERR(snap_name);
Alex Elder9e15b772012-10-30 19:40:33 -05005308 goto out_err;
Alex Elder2e9f7f12013-04-26 09:43:48 -05005309 }
5310
5311 spec->pool_name = pool_name;
5312 spec->image_name = image_name;
5313 spec->snap_name = snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05005314
5315 return 0;
Ilya Dryomov04077592014-07-23 17:11:20 +04005316
Alex Elder9e15b772012-10-30 19:40:33 -05005317out_err:
Alex Elder2e9f7f12013-04-26 09:43:48 -05005318 kfree(image_name);
5319 kfree(pool_name);
Alex Elder9e15b772012-10-30 19:40:33 -05005320 return ret;
5321}
5322
Alex Eldercc4a38bd2013-04-30 00:44:33 -05005323static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
Alex Elder35d489f2012-07-03 16:01:19 -05005324{
5325 size_t size;
5326 int ret;
5327 void *reply_buf;
5328 void *p;
5329 void *end;
5330 u64 seq;
5331 u32 snap_count;
5332 struct ceph_snap_context *snapc;
5333 u32 i;
5334
5335 /*
5336 * We'll need room for the seq value (maximum snapshot id),
5337 * snapshot count, and array of that many snapshot ids.
5338 * For now we have a fixed upper limit on the number we're
5339 * prepared to receive.
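 5340	 *
 5340	 * The reply payload decoded below is laid out as: __le64 seq,
 5340	 * __le32 snap_count, then snap_count __le64 snapshot ids.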
5340 */
5341 size = sizeof (__le64) + sizeof (__le32) +
5342 RBD_MAX_SNAP_COUNT * sizeof (__le64);
5343 reply_buf = kzalloc(size, GFP_KERNEL);
5344 if (!reply_buf)
5345 return -ENOMEM;
5346
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005347 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5348 &rbd_dev->header_oloc, "get_snapcontext",
5349 NULL, 0, reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06005350 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder35d489f2012-07-03 16:01:19 -05005351 if (ret < 0)
5352 goto out;
5353
Alex Elder35d489f2012-07-03 16:01:19 -05005354 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05005355 end = reply_buf + ret;
5356 ret = -ERANGE;
Alex Elder35d489f2012-07-03 16:01:19 -05005357 ceph_decode_64_safe(&p, end, seq, out);
5358 ceph_decode_32_safe(&p, end, snap_count, out);
5359
5360 /*
5361 * Make sure the reported number of snapshot ids wouldn't go
5362 * beyond the end of our buffer. But before checking that,
5363 * make sure the computed size of the snapshot context we
5364 * allocate is representable in a size_t.
5365 */
5366 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
5367 / sizeof (u64)) {
5368 ret = -EINVAL;
5369 goto out;
5370 }
5371 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
5372 goto out;
Alex Elder468521c2013-04-26 09:43:47 -05005373 ret = 0;
Alex Elder35d489f2012-07-03 16:01:19 -05005374
Alex Elder812164f82013-04-30 00:44:32 -05005375 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
Alex Elder35d489f2012-07-03 16:01:19 -05005376 if (!snapc) {
5377 ret = -ENOMEM;
5378 goto out;
5379 }
Alex Elder35d489f2012-07-03 16:01:19 -05005380 snapc->seq = seq;
Alex Elder35d489f2012-07-03 16:01:19 -05005381 for (i = 0; i < snap_count; i++)
5382 snapc->snaps[i] = ceph_decode_64(&p);
5383
Alex Elder49ece552013-05-06 08:37:00 -05005384 ceph_put_snap_context(rbd_dev->header.snapc);
Alex Elder35d489f2012-07-03 16:01:19 -05005385 rbd_dev->header.snapc = snapc;
5386
5387 dout(" snap context seq = %llu, snap_count = %u\n",
Alex Elder57385b52013-04-21 12:14:45 -05005388 (unsigned long long)seq, (unsigned int)snap_count);
Alex Elder35d489f2012-07-03 16:01:19 -05005389out:
5390 kfree(reply_buf);
5391
Alex Elder57385b52013-04-21 12:14:45 -05005392 return ret;
Alex Elder35d489f2012-07-03 16:01:19 -05005393}
5394
Alex Elder54cac612013-04-30 00:44:33 -05005395static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
5396 u64 snap_id)
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005397{
5398 size_t size;
5399 void *reply_buf;
Alex Elder54cac612013-04-30 00:44:33 -05005400 __le64 snapid;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005401 int ret;
5402 void *p;
5403 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005404 char *snap_name;
5405
5406 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
5407 reply_buf = kmalloc(size, GFP_KERNEL);
5408 if (!reply_buf)
5409 return ERR_PTR(-ENOMEM);
5410
Alex Elder54cac612013-04-30 00:44:33 -05005411 snapid = cpu_to_le64(snap_id);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005412 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5413 &rbd_dev->header_oloc, "get_snapshot_name",
5414 &snapid, sizeof(snapid), reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06005415 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderf40eb342013-04-25 15:09:42 -05005416 if (ret < 0) {
5417 snap_name = ERR_PTR(ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005418 goto out;
Alex Elderf40eb342013-04-25 15:09:42 -05005419 }
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005420
5421 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05005422 end = reply_buf + ret;
Alex Eldere5c35532012-10-25 23:34:41 -05005423 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderf40eb342013-04-25 15:09:42 -05005424 if (IS_ERR(snap_name))
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005425 goto out;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005426
Alex Elderf40eb342013-04-25 15:09:42 -05005427 dout(" snap_id 0x%016llx snap_name = %s\n",
Alex Elder54cac612013-04-30 00:44:33 -05005428 (unsigned long long)snap_id, snap_name);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005429out:
5430 kfree(reply_buf);
5431
Alex Elderf40eb342013-04-25 15:09:42 -05005432 return snap_name;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005433}
5434
Alex Elder2df3fac2013-05-06 09:51:30 -05005435static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
Alex Elder117973f2012-08-31 17:29:55 -05005436{
Alex Elder2df3fac2013-05-06 09:51:30 -05005437 bool first_time = rbd_dev->header.object_prefix == NULL;
Alex Elder117973f2012-08-31 17:29:55 -05005438 int ret;
Alex Elder117973f2012-08-31 17:29:55 -05005439
Josh Durgin1617e402013-06-12 14:43:10 -07005440 ret = rbd_dev_v2_image_size(rbd_dev);
5441 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05005442 return ret;
Josh Durgin1617e402013-06-12 14:43:10 -07005443
Alex Elder2df3fac2013-05-06 09:51:30 -05005444 if (first_time) {
5445 ret = rbd_dev_v2_header_onetime(rbd_dev);
5446 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05005447 return ret;
Alex Elder2df3fac2013-05-06 09:51:30 -05005448 }
5449
Alex Eldercc4a38bd2013-04-30 00:44:33 -05005450 ret = rbd_dev_v2_snap_context(rbd_dev);
Ilya Dryomovd194cd12015-08-31 18:22:10 +03005451 if (ret && first_time) {
5452 kfree(rbd_dev->header.object_prefix);
5453 rbd_dev->header.object_prefix = NULL;
5454 }
Alex Elder117973f2012-08-31 17:29:55 -05005455
5456 return ret;
5457}
5458
Ilya Dryomova720ae02014-07-23 17:11:19 +04005459static int rbd_dev_header_info(struct rbd_device *rbd_dev)
5460{
5461 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5462
5463 if (rbd_dev->image_format == 1)
5464 return rbd_dev_v1_header_info(rbd_dev);
5465
5466 return rbd_dev_v2_header_info(rbd_dev);
5467}
5468
Alex Elder1ddbe942012-01-29 13:57:44 -06005469/*
Alex Eldere28fff262012-02-02 08:13:30 -06005470 * Skips over white space at *buf, and updates *buf to point to the
5471 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06005472 * the token (string of non-white space characters) found. Note
5473 * that *buf must be terminated with '\0'.
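 *
 * For example, with *buf pointing at "  pool img", a call would
 * advance *buf to "pool img" and return 4 (the length of "pool").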
Alex Eldere28fff262012-02-02 08:13:30 -06005474 */
5475static inline size_t next_token(const char **buf)
5476{
5477 /*
5478 * These are the characters that produce nonzero for
5479 * isspace() in the "C" and "POSIX" locales.
5480 */
5481 const char *spaces = " \f\n\r\t\v";
5482
5483 *buf += strspn(*buf, spaces); /* Find start of token */
5484
5485 return strcspn(*buf, spaces); /* Return token length */
5486}
5487
5488/*
Alex Elderea3352f2012-07-09 21:04:23 -05005489 * Finds the next token in *buf, dynamically allocates a buffer big
5490 * enough to hold a copy of it, and copies the token into the new
5491 * buffer. The copy is guaranteed to be terminated with '\0'. Note
5492 * that a duplicate buffer is created even for a zero-length token.
5493 *
5494 * Returns a pointer to the newly-allocated duplicate, or a null
5495 * pointer if memory for the duplicate was not available. If
5496 * the lenp argument is a non-null pointer, the length of the token
5497 * (not including the '\0') is returned in *lenp.
5498 *
5499 * If successful, the *buf pointer will be updated to point beyond
5500 * the end of the found token.
5501 *
5502 * Note: uses GFP_KERNEL for allocation.
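 *
 * For example, with *buf pointing at " rbd foo", a call returns a
 * newly-allocated copy of "rbd" (*lenp == 3) and leaves *buf
 * pointing at " foo".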
5503 */
5504static inline char *dup_token(const char **buf, size_t *lenp)
5505{
5506 char *dup;
5507 size_t len;
5508
5509 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05005510 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05005511 if (!dup)
5512 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05005513 *(dup + len) = '\0';
5514 *buf += len;
5515
5516 if (lenp)
5517 *lenp = len;
5518
5519 return dup;
5520}
5521
5522/*
Alex Elder859c31d2012-10-25 23:34:42 -05005523 * Parse the options provided for an "rbd add" (i.e., rbd image
5524 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
5525 * and the data written is passed here via a NUL-terminated buffer.
5526 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05005527 *
Alex Elder859c31d2012-10-25 23:34:42 -05005528 * The information extracted from these options is recorded in
5529 * the other parameters which return dynamically-allocated
5530 * structures:
5531 * ceph_opts
5532 * The address of a pointer that will refer to a ceph options
5533 * structure. Caller must release the returned pointer using
5534 * ceph_destroy_options() when it is no longer needed.
5535 * rbd_opts
5536 * Address of an rbd options pointer. Fully initialized by
5537 * this function; caller must release with kfree().
5538 * spec
5539 * Address of an rbd image specification pointer. Fully
5540 * initialized by this function based on parsed options.
5541 * Caller must release with rbd_spec_put().
5542 *
5543 * The options passed take this form:
 5544 *          <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
5545 * where:
5546 * <mon_addrs>
5547 * A comma-separated list of one or more monitor addresses.
5548 * A monitor address is an ip address, optionally followed
5549 * by a port number (separated by a colon).
5550 * I.e.: ip1[:port1][,ip2[:port2]...]
5551 * <options>
5552 * A comma-separated list of ceph and/or rbd options.
5553 * <pool_name>
5554 * The name of the rados pool containing the rbd image.
5555 * <image_name>
5556 * The name of the image in that pool to map.
 5557 *  <snap_name>
 5558 *      An optional snapshot name.  If provided, the mapping will
 5559 *      present data from the image at the time that snapshot was
 5560 *      created.  The image head is used if no snapshot name is
 5561 *      provided.  Snapshot mappings are always read-only.
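 *
 * For example (hypothetical monitor address and key):
 *   $ echo "1.2.3.4:6789 name=admin,secret=<key> rbd myimage -" \
 *       > /sys/bus/rbd/add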
Alex Eldera725f65e2012-02-02 08:13:30 -06005562 */
Alex Elder859c31d2012-10-25 23:34:42 -05005563static int rbd_add_parse_args(const char *buf,
Alex Elderdc79b112012-10-25 23:34:41 -05005564 struct ceph_options **ceph_opts,
Alex Elder859c31d2012-10-25 23:34:42 -05005565 struct rbd_options **opts,
5566 struct rbd_spec **rbd_spec)
Alex Eldera725f65e2012-02-02 08:13:30 -06005567{
Alex Elderd22f76e2012-07-12 10:46:35 -05005568 size_t len;
Alex Elder859c31d2012-10-25 23:34:42 -05005569 char *options;
Alex Elder0ddebc02012-10-25 23:34:41 -05005570 const char *mon_addrs;
Alex Elderecb4dc22013-04-26 09:43:47 -05005571 char *snap_name;
Alex Elder0ddebc02012-10-25 23:34:41 -05005572 size_t mon_addrs_size;
Alex Elder859c31d2012-10-25 23:34:42 -05005573 struct rbd_spec *spec = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05005574 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05005575 struct ceph_options *copts;
Alex Elderdc79b112012-10-25 23:34:41 -05005576 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06005577
5578 /* The first four tokens are required */
5579
Alex Elder7ef32142012-02-02 08:13:30 -06005580 len = next_token(&buf);
Alex Elder4fb5d6712012-11-01 10:17:15 -05005581 if (!len) {
5582 rbd_warn(NULL, "no monitor address(es) provided");
5583 return -EINVAL;
5584 }
Alex Elder0ddebc02012-10-25 23:34:41 -05005585 mon_addrs = buf;
Alex Elderf28e5652012-10-25 23:34:41 -05005586 mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06005587 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06005588
Alex Elderdc79b112012-10-25 23:34:41 -05005589 ret = -EINVAL;
Alex Elderf28e5652012-10-25 23:34:41 -05005590 options = dup_token(&buf, NULL);
5591 if (!options)
Alex Elderdc79b112012-10-25 23:34:41 -05005592 return -ENOMEM;
Alex Elder4fb5d6712012-11-01 10:17:15 -05005593 if (!*options) {
5594 rbd_warn(NULL, "no options provided");
5595 goto out_err;
5596 }
Alex Eldera725f65e2012-02-02 08:13:30 -06005597
Alex Elder859c31d2012-10-25 23:34:42 -05005598 spec = rbd_spec_alloc();
5599 if (!spec)
Alex Elderf28e5652012-10-25 23:34:41 -05005600 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05005601
5602 spec->pool_name = dup_token(&buf, NULL);
5603 if (!spec->pool_name)
5604 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05005605 if (!*spec->pool_name) {
5606 rbd_warn(NULL, "no pool name provided");
5607 goto out_err;
5608 }
Alex Eldere28fff262012-02-02 08:13:30 -06005609
Alex Elder69e7a022012-11-01 08:39:26 -05005610 spec->image_name = dup_token(&buf, NULL);
Alex Elder859c31d2012-10-25 23:34:42 -05005611 if (!spec->image_name)
Alex Elderf28e5652012-10-25 23:34:41 -05005612 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05005613 if (!*spec->image_name) {
5614 rbd_warn(NULL, "no image name provided");
5615 goto out_err;
5616 }
Alex Eldere28fff262012-02-02 08:13:30 -06005617
Alex Elderf28e5652012-10-25 23:34:41 -05005618 /*
5619 * Snapshot name is optional; default is to use "-"
5620 * (indicating the head/no snapshot).
5621 */
Alex Elder3feeb8942012-08-31 17:29:52 -05005622 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05005623 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05005624 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
5625 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elderf28e5652012-10-25 23:34:41 -05005626 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
Alex Elderdc79b112012-10-25 23:34:41 -05005627 ret = -ENAMETOOLONG;
Alex Elderf28e5652012-10-25 23:34:41 -05005628 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05005629 }
Alex Elderecb4dc22013-04-26 09:43:47 -05005630 snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
5631 if (!snap_name)
Alex Elderf28e5652012-10-25 23:34:41 -05005632 goto out_mem;
Alex Elderecb4dc22013-04-26 09:43:47 -05005633 *(snap_name + len) = '\0';
5634 spec->snap_name = snap_name;
Alex Eldere5c35532012-10-25 23:34:41 -05005635
Alex Elder0ddebc02012-10-25 23:34:41 -05005636 /* Initialize all rbd options to the defaults */
Alex Eldere28fff262012-02-02 08:13:30 -06005637
Alex Elder4e9afeb2012-10-25 23:34:41 -05005638 rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
5639 if (!rbd_opts)
5640 goto out_mem;
5641
5642 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
Ilya Dryomovb5584182015-06-23 16:21:19 +03005643 rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
Ilya Dryomov80de1912016-09-20 14:23:17 +02005644 rbd_opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
Ilya Dryomove010dd02017-04-13 12:17:39 +02005645 rbd_opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
Alex Elderd22f76e2012-07-12 10:46:35 -05005646
Alex Elder859c31d2012-10-25 23:34:42 -05005647 copts = ceph_parse_options(options, mon_addrs,
Alex Elder0ddebc02012-10-25 23:34:41 -05005648 mon_addrs + mon_addrs_size - 1,
Alex Elder4e9afeb2012-10-25 23:34:41 -05005649 parse_rbd_opts_token, rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05005650 if (IS_ERR(copts)) {
5651 ret = PTR_ERR(copts);
Alex Elderdc79b112012-10-25 23:34:41 -05005652 goto out_err;
5653 }
Alex Elder859c31d2012-10-25 23:34:42 -05005654 kfree(options);
5655
5656 *ceph_opts = copts;
Alex Elder4e9afeb2012-10-25 23:34:41 -05005657 *opts = rbd_opts;
Alex Elder859c31d2012-10-25 23:34:42 -05005658 *rbd_spec = spec;
Alex Elder0ddebc02012-10-25 23:34:41 -05005659
Alex Elderdc79b112012-10-25 23:34:41 -05005660 return 0;
Alex Elderf28e5652012-10-25 23:34:41 -05005661out_mem:
Alex Elderdc79b112012-10-25 23:34:41 -05005662 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05005663out_err:
Alex Elder859c31d2012-10-25 23:34:42 -05005664 kfree(rbd_opts);
5665 rbd_spec_put(spec);
Alex Elderf28e5652012-10-25 23:34:41 -05005666 kfree(options);
Alex Elderd22f76e2012-07-12 10:46:35 -05005667
Alex Elderdc79b112012-10-25 23:34:41 -05005668 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06005669}
5670
Alex Elder589d30e2012-07-10 20:30:11 -05005671/*
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005672 * Return pool id (>= 0) or a negative error code.
5673 */
5674static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
5675{
Ilya Dryomova319bf52015-05-15 12:02:17 +03005676 struct ceph_options *opts = rbdc->client->options;
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005677 u64 newest_epoch;
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005678 int tries = 0;
5679 int ret;
5680
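	/*
	 * The pool may have been created after our cached osdmap was
	 * fetched, so on ENOENT retry once with an up-to-date map.
	 */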
5681again:
5682 ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
5683 if (ret == -ENOENT && tries++ < 1) {
Ilya Dryomovd0b19702016-04-28 16:07:27 +02005684 ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap",
5685 &newest_epoch);
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005686 if (ret < 0)
5687 return ret;
5688
5689 if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
Ilya Dryomov7cca78c2016-04-28 16:07:28 +02005690 ceph_osdc_maybe_request_map(&rbdc->client->osdc);
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005691 (void) ceph_monc_wait_osdmap(&rbdc->client->monc,
Ilya Dryomova319bf52015-05-15 12:02:17 +03005692 newest_epoch,
5693 opts->mount_timeout);
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005694 goto again;
5695 } else {
5696 /* the osdmap we have is new enough */
5697 return -ENOENT;
5698 }
5699 }
5700
5701 return ret;
5702}
5703
Ilya Dryomove010dd02017-04-13 12:17:39 +02005704static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
5705{
5706 down_write(&rbd_dev->lock_rwsem);
5707 if (__rbd_is_lock_owner(rbd_dev))
5708 rbd_unlock(rbd_dev);
5709 up_write(&rbd_dev->lock_rwsem);
5710}
5711
5712static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
5713{
5714 if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
5715 rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
5716 return -EINVAL;
5717 }
5718
5719 /* FIXME: "rbd map --exclusive" should be in interruptible */
5720 down_read(&rbd_dev->lock_rwsem);
5721 rbd_wait_state_locked(rbd_dev);
5722 up_read(&rbd_dev->lock_rwsem);
5723 if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
5724 rbd_warn(rbd_dev, "failed to acquire exclusive lock");
5725 return -EROFS;
5726 }
5727
5728 return 0;
5729}
5730
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005731/*
Alex Elder589d30e2012-07-10 20:30:11 -05005732 * An rbd format 2 image has a unique identifier, distinct from the
5733 * name given to it by the user. Internally, that identifier is
5734 * what's used to specify the names of objects related to the image.
5735 *
5736 * A special "rbd id" object is used to map an rbd image name to its
5737 * id. If that object doesn't exist, then there is no v2 rbd image
5738 * with the supplied name.
5739 *
5740 * This function will record the given rbd_dev's image_id field if
5741 * it can be determined, and in that case will return 0. If any
5742 * errors occur a negative errno will be returned and the rbd_dev's
5743 * image_id field will be unchanged (and should be NULL).
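 *
 * (Illustratively, for an image named "foo" the id object built
 * below is "<RBD_ID_PREFIX>foo", and its contents are the image's
 * persistent id.)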
5744 */
5745static int rbd_dev_image_id(struct rbd_device *rbd_dev)
5746{
5747 int ret;
5748 size_t size;
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005749 CEPH_DEFINE_OID_ONSTACK(oid);
Alex Elder589d30e2012-07-10 20:30:11 -05005750 void *response;
Alex Elderc0fba362013-04-25 23:15:08 -05005751 char *image_id;
Alex Elder2f82ee52012-10-30 19:40:33 -05005752
Alex Elder589d30e2012-07-10 20:30:11 -05005753 /*
Alex Elder2c0d0a12012-10-30 19:40:33 -05005754 * When probing a parent image, the image id is already
5755 * known (and the image name likely is not). There's no
Alex Elderc0fba362013-04-25 23:15:08 -05005756 * need to fetch the image id again in this case. We
5757 * do still need to set the image format though.
Alex Elder2c0d0a12012-10-30 19:40:33 -05005758 */
Alex Elderc0fba362013-04-25 23:15:08 -05005759 if (rbd_dev->spec->image_id) {
5760 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
5761
Alex Elder2c0d0a12012-10-30 19:40:33 -05005762 return 0;
Alex Elderc0fba362013-04-25 23:15:08 -05005763 }
Alex Elder2c0d0a12012-10-30 19:40:33 -05005764
5765 /*
Alex Elder589d30e2012-07-10 20:30:11 -05005766 * First, see if the format 2 image id file exists, and if
5767 * so, get the image's persistent id from it.
5768 */
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005769 ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
5770 rbd_dev->spec->image_name);
5771 if (ret)
5772 return ret;
5773
5774 dout("rbd id object name is %s\n", oid.name);
Alex Elder589d30e2012-07-10 20:30:11 -05005775
5776 /* Response will be an encoded string, which includes a length */
5777
5778 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
5779 response = kzalloc(size, GFP_NOIO);
5780 if (!response) {
5781 ret = -ENOMEM;
5782 goto out;
5783 }
5784
Alex Elderc0fba362013-04-25 23:15:08 -05005785 /* If it doesn't exist we'll assume it's a format 1 image */
5786
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005787 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5788 "get_id", NULL, 0,
5789 response, RBD_IMAGE_ID_LEN_MAX);
Alex Elder36be9a72013-01-19 00:30:28 -06005790 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderc0fba362013-04-25 23:15:08 -05005791 if (ret == -ENOENT) {
5792 image_id = kstrdup("", GFP_KERNEL);
5793 ret = image_id ? 0 : -ENOMEM;
5794 if (!ret)
5795 rbd_dev->image_format = 1;
Ilya Dryomov7dd440c2014-09-11 18:49:18 +04005796 } else if (ret >= 0) {
Alex Elderc0fba362013-04-25 23:15:08 -05005797 void *p = response;
Alex Elder589d30e2012-07-10 20:30:11 -05005798
Alex Elderc0fba362013-04-25 23:15:08 -05005799 image_id = ceph_extract_encoded_string(&p, p + ret,
Alex Elder979ed482012-11-01 08:39:26 -05005800 NULL, GFP_NOIO);
Duan Jiong461f7582014-04-11 16:38:12 +08005801 ret = PTR_ERR_OR_ZERO(image_id);
Alex Elderc0fba362013-04-25 23:15:08 -05005802 if (!ret)
5803 rbd_dev->image_format = 2;
Alex Elderc0fba362013-04-25 23:15:08 -05005804 }
5805
5806 if (!ret) {
5807 rbd_dev->spec->image_id = image_id;
5808 dout("image_id is %s\n", image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05005809 }
5810out:
5811 kfree(response);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005812 ceph_oid_destroy(&oid);
Alex Elder589d30e2012-07-10 20:30:11 -05005813 return ret;
5814}
5815
Alex Elder3abef3b2013-05-13 20:35:37 -05005816/*
5817 * Undo whatever state changes are made by v1 or v2 header info
5818 * call.
5819 */
Alex Elder6fd48b32013-04-28 23:32:34 -05005820static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
5821{
5822 struct rbd_image_header *header;
5823
Ilya Dryomove69b8d42015-01-19 12:06:14 +03005824 rbd_dev_parent_put(rbd_dev);
Alex Elder6fd48b32013-04-28 23:32:34 -05005825
5826 /* Free dynamic fields from the header, then zero it out */
5827
5828 header = &rbd_dev->header;
Alex Elder812164f82013-04-30 00:44:32 -05005829 ceph_put_snap_context(header->snapc);
Alex Elder6fd48b32013-04-28 23:32:34 -05005830 kfree(header->snap_sizes);
5831 kfree(header->snap_names);
5832 kfree(header->object_prefix);
5833 memset(header, 0, sizeof (*header));
5834}
5835
Alex Elder2df3fac2013-05-06 09:51:30 -05005836static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
Alex Eldera30b71b2012-07-10 20:30:11 -05005837{
5838 int ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05005839
Alex Elder1e130192012-07-03 16:01:19 -05005840 ret = rbd_dev_v2_object_prefix(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05005841 if (ret)
Alex Elder1e130192012-07-03 16:01:19 -05005842 goto out_err;
Alex Elderb1b54022012-07-03 16:01:19 -05005843
Alex Elder2df3fac2013-05-06 09:51:30 -05005844 /*
 5845	 * Get and check the features for the image.  Currently the
 5846	 * features are assumed never to change.
5847 */
Alex Elderb1b54022012-07-03 16:01:19 -05005848 ret = rbd_dev_v2_features(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05005849 if (ret)
Alex Elderb1b54022012-07-03 16:01:19 -05005850 goto out_err;
Alex Elder35d489f2012-07-03 16:01:19 -05005851
Alex Eldercc070d52013-04-21 12:14:45 -05005852 /* If the image supports fancy striping, get its parameters */
5853
5854 if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
5855 ret = rbd_dev_v2_striping_info(rbd_dev);
5856 if (ret < 0)
5857 goto out_err;
5858 }
Alex Eldera30b71b2012-07-10 20:30:11 -05005859
Ilya Dryomov7e973322017-01-25 18:16:22 +01005860 if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
5861 ret = rbd_dev_v2_data_pool(rbd_dev);
5862 if (ret)
5863 goto out_err;
5864 }
5865
Ilya Dryomov263423f2017-01-25 18:16:22 +01005866 rbd_init_layout(rbd_dev);
Alex Elder35152972012-08-31 17:29:55 -05005867 return 0;
Ilya Dryomov263423f2017-01-25 18:16:22 +01005868
Alex Elder9d475de2012-07-03 16:01:19 -05005869out_err:
Alex Elder642a2532013-05-06 17:40:33 -05005870 rbd_dev->header.features = 0;
Alex Elder1e130192012-07-03 16:01:19 -05005871 kfree(rbd_dev->header.object_prefix);
5872 rbd_dev->header.object_prefix = NULL;
Alex Elder9d475de2012-07-03 16:01:19 -05005873 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05005874}
5875
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005876/*
5877 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
5878 * rbd_dev_image_probe() recursion depth, which means it's also the
5879 * length of the already discovered part of the parent chain.
5880 */
5881static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
Alex Elder83a06262012-10-30 15:47:17 -05005882{
Alex Elder2f82ee52012-10-30 19:40:33 -05005883 struct rbd_device *parent = NULL;
Alex Elder124afba2013-04-26 15:44:36 -05005884 int ret;
5885
5886 if (!rbd_dev->parent_spec)
5887 return 0;
Alex Elder124afba2013-04-26 15:44:36 -05005888
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005889 if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
5890 pr_info("parent chain is too long (%d)\n", depth);
5891 ret = -EINVAL;
5892 goto out_err;
5893 }
5894
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005895 parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005896 if (!parent) {
5897 ret = -ENOMEM;
Alex Elder124afba2013-04-26 15:44:36 -05005898 goto out_err;
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005899 }
5900
5901 /*
5902 * Images related by parent/child relationships always share
5903 * rbd_client and spec/parent_spec, so bump their refcounts.
5904 */
5905 __rbd_get_client(rbd_dev->rbd_client);
5906 rbd_spec_get(rbd_dev->parent_spec);
Alex Elder124afba2013-04-26 15:44:36 -05005907
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005908 ret = rbd_dev_image_probe(parent, depth);
Alex Elder124afba2013-04-26 15:44:36 -05005909 if (ret < 0)
5910 goto out_err;
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005911
Alex Elder124afba2013-04-26 15:44:36 -05005912 rbd_dev->parent = parent;
Alex Eldera2acd002013-05-08 22:50:04 -05005913 atomic_set(&rbd_dev->parent_ref, 1);
Alex Elder124afba2013-04-26 15:44:36 -05005914 return 0;
Alex Elder124afba2013-04-26 15:44:36 -05005915
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005916out_err:
5917 rbd_dev_unparent(rbd_dev);
Markus Elfring1761b222015-11-23 20:16:45 +01005918 rbd_dev_destroy(parent);
Alex Elder124afba2013-04-26 15:44:36 -05005919 return ret;
5920}
5921
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005922static void rbd_dev_device_release(struct rbd_device *rbd_dev)
5923{
5924 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5925 rbd_dev_mapping_clear(rbd_dev);
5926 rbd_free_disk(rbd_dev);
5927 if (!single_major)
5928 unregister_blkdev(rbd_dev->major, rbd_dev->name);
5929}
5930
Ilya Dryomov811c6682016-04-15 16:22:16 +02005931/*
5932 * rbd_dev->header_rwsem must be locked for write and will be unlocked
5933 * upon return.
5934 */
Alex Elder200a6a82013-04-28 23:32:34 -05005935static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
Alex Elder124afba2013-04-26 15:44:36 -05005936{
Alex Elder83a06262012-10-30 15:47:17 -05005937 int ret;
Alex Elder83a06262012-10-30 15:47:17 -05005938
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005939 /* Record our major and minor device numbers. */
Alex Elder83a06262012-10-30 15:47:17 -05005940
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005941 if (!single_major) {
5942 ret = register_blkdev(0, rbd_dev->name);
5943 if (ret < 0)
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005944 goto err_out_unlock;
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005945
5946 rbd_dev->major = ret;
5947 rbd_dev->minor = 0;
5948 } else {
5949 rbd_dev->major = rbd_major;
5950 rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
5951 }
Alex Elder83a06262012-10-30 15:47:17 -05005952
5953 /* Set up the blkdev mapping. */
5954
5955 ret = rbd_init_disk(rbd_dev);
5956 if (ret)
5957 goto err_out_blkdev;
5958
Alex Elderf35a4de2013-05-06 09:51:29 -05005959 ret = rbd_dev_mapping_set(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05005960 if (ret)
5961 goto err_out_disk;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04005962
Alex Elderf35a4de2013-05-06 09:51:29 -05005963 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
Ilya Dryomov9568c932017-10-12 12:35:19 +02005964 set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);
Alex Elderf35a4de2013-05-06 09:51:29 -05005965
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005966 ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
Alex Elderf35a4de2013-05-06 09:51:29 -05005967 if (ret)
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04005968 goto err_out_mapping;
Alex Elder83a06262012-10-30 15:47:17 -05005969
Alex Elder129b79d2013-04-26 15:44:36 -05005970 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
Ilya Dryomov811c6682016-04-15 16:22:16 +02005971 up_write(&rbd_dev->header_rwsem);
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005972 return 0;
Alex Elder2f82ee52012-10-30 19:40:33 -05005973
Alex Elderf35a4de2013-05-06 09:51:29 -05005974err_out_mapping:
5975 rbd_dev_mapping_clear(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05005976err_out_disk:
5977 rbd_free_disk(rbd_dev);
5978err_out_blkdev:
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005979 if (!single_major)
5980 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Ilya Dryomov811c6682016-04-15 16:22:16 +02005981err_out_unlock:
5982 up_write(&rbd_dev->header_rwsem);
Alex Elder83a06262012-10-30 15:47:17 -05005983 return ret;
5984}
5985
Alex Elder332bb122013-04-27 09:59:30 -05005986static int rbd_dev_header_name(struct rbd_device *rbd_dev)
5987{
5988 struct rbd_spec *spec = rbd_dev->spec;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005989 int ret;
Alex Elder332bb122013-04-27 09:59:30 -05005990
5991 /* Record the header object name for this rbd image. */
5992
5993 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder332bb122013-04-27 09:59:30 -05005994 if (rbd_dev->image_format == 1)
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005995 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
5996 spec->image_name, RBD_SUFFIX);
Alex Elder332bb122013-04-27 09:59:30 -05005997 else
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005998 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
5999 RBD_HEADER_PREFIX, spec->image_id);
Alex Elder332bb122013-04-27 09:59:30 -05006000
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006001 return ret;
Alex Elder332bb122013-04-27 09:59:30 -05006002}
6003
Alex Elder200a6a82013-04-28 23:32:34 -05006004static void rbd_dev_image_release(struct rbd_device *rbd_dev)
6005{
Alex Elder6fd48b32013-04-28 23:32:34 -05006006 rbd_dev_unprobe(rbd_dev);
Ilya Dryomovfd22aef2017-04-13 12:17:37 +02006007 if (rbd_dev->opts)
6008 rbd_unregister_watch(rbd_dev);
Alex Elder6fd48b32013-04-28 23:32:34 -05006009 rbd_dev->image_format = 0;
6010 kfree(rbd_dev->spec->image_id);
6011 rbd_dev->spec->image_id = NULL;
Alex Elder200a6a82013-04-28 23:32:34 -05006012}
6013
Alex Eldera30b71b2012-07-10 20:30:11 -05006014/*
6015 * Probe for the existence of the header object for the given rbd
Alex Elder1f3ef782013-05-06 17:40:33 -05006016 * device. If this image is the one being mapped (i.e., not a
6017 * parent), initiate a watch on its header object before using that
6018 * object to get detailed information about the rbd image.
Alex Eldera30b71b2012-07-10 20:30:11 -05006019 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006020static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
Alex Eldera30b71b2012-07-10 20:30:11 -05006021{
6022 int ret;
6023
6024 /*
Alex Elder3abef3b2013-05-13 20:35:37 -05006025 * Get the id from the image id object. Unless there's an
6026 * error, rbd_dev->spec->image_id will be filled in with
6027 * a dynamically-allocated string, and rbd_dev->image_format
6028 * will be set to either 1 or 2.
Alex Eldera30b71b2012-07-10 20:30:11 -05006029 */
6030 ret = rbd_dev_image_id(rbd_dev);
6031 if (ret)
Alex Elderc0fba362013-04-25 23:15:08 -05006032 return ret;
Alex Elderc0fba362013-04-25 23:15:08 -05006033
Alex Elder332bb122013-04-27 09:59:30 -05006034 ret = rbd_dev_header_name(rbd_dev);
6035 if (ret)
6036 goto err_out_format;
6037
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006038 if (!depth) {
Ilya Dryomov99d16942016-08-12 16:11:41 +02006039 ret = rbd_register_watch(rbd_dev);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006040 if (ret) {
6041 if (ret == -ENOENT)
6042 pr_info("image %s/%s does not exist\n",
6043 rbd_dev->spec->pool_name,
6044 rbd_dev->spec->image_name);
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006045 goto err_out_format;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006046 }
Alex Elder1f3ef782013-05-06 17:40:33 -05006047 }
Alex Elderb644de22013-04-27 09:59:31 -05006048
Ilya Dryomova720ae02014-07-23 17:11:19 +04006049 ret = rbd_dev_header_info(rbd_dev);
Alex Elder5655c4d2013-04-25 23:15:08 -05006050 if (ret)
Alex Elderb644de22013-04-27 09:59:31 -05006051 goto err_out_watch;
Alex Elder83a06262012-10-30 15:47:17 -05006052
Ilya Dryomov04077592014-07-23 17:11:20 +04006053 /*
6054 * If this image is the one being mapped, we have pool name and
6055 * id, image name and id, and snap name - need to fill snap id.
6056 * Otherwise this is a parent image, identified by pool, image
6057 * and snap ids - need to fill in names for those ids.
6058 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006059 if (!depth)
Ilya Dryomov04077592014-07-23 17:11:20 +04006060 ret = rbd_spec_fill_snap_id(rbd_dev);
6061 else
6062 ret = rbd_spec_fill_names(rbd_dev);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006063 if (ret) {
6064 if (ret == -ENOENT)
6065 pr_info("snap %s/%s@%s does not exist\n",
6066 rbd_dev->spec->pool_name,
6067 rbd_dev->spec->image_name,
6068 rbd_dev->spec->snap_name);
Alex Elder33dca392013-04-30 00:44:33 -05006069 goto err_out_probe;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006070 }
Alex Elder9bb81c92013-04-27 09:59:30 -05006071
Ilya Dryomove8f59b52014-07-24 10:42:13 +04006072 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
6073 ret = rbd_dev_v2_parent_info(rbd_dev);
6074 if (ret)
6075 goto err_out_probe;
6076
6077 /*
6078 * Need to warn users if this image is the one being
6079 * mapped and has a parent.
6080 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006081 if (!depth && rbd_dev->parent_spec)
Ilya Dryomove8f59b52014-07-24 10:42:13 +04006082 rbd_warn(rbd_dev,
6083 "WARNING: kernel layering is EXPERIMENTAL!");
6084 }
6085
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006086 ret = rbd_dev_probe_parent(rbd_dev, depth);
Alex Elder30d60ba2013-05-06 09:51:30 -05006087 if (ret)
6088 goto err_out_probe;
Alex Elder83a06262012-10-30 15:47:17 -05006089
Alex Elder30d60ba2013-05-06 09:51:30 -05006090 dout("discovered format %u image, header name is %s\n",
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006091 rbd_dev->image_format, rbd_dev->header_oid.name);
Alex Elder30d60ba2013-05-06 09:51:30 -05006092 return 0;
Ilya Dryomove8f59b52014-07-24 10:42:13 +04006093
Alex Elder6fd48b32013-04-28 23:32:34 -05006094err_out_probe:
6095 rbd_dev_unprobe(rbd_dev);
Alex Elderb644de22013-04-27 09:59:31 -05006096err_out_watch:
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006097 if (!depth)
Ilya Dryomov99d16942016-08-12 16:11:41 +02006098 rbd_unregister_watch(rbd_dev);
Alex Elder332bb122013-04-27 09:59:30 -05006099err_out_format:
6100 rbd_dev->image_format = 0;
Alex Elder5655c4d2013-04-25 23:15:08 -05006101 kfree(rbd_dev->spec->image_id);
6102 rbd_dev->spec->image_id = NULL;
Alex Elder5655c4d2013-04-25 23:15:08 -05006103 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05006104}
6105
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006106static ssize_t do_rbd_add(struct bus_type *bus,
6107 const char *buf,
6108 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006109{
Alex Eldercb8627c2012-07-09 21:04:23 -05006110 struct rbd_device *rbd_dev = NULL;
Alex Elderdc79b112012-10-25 23:34:41 -05006111 struct ceph_options *ceph_opts = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05006112 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05006113 struct rbd_spec *spec = NULL;
Alex Elder9d3997f2012-10-25 23:34:42 -05006114 struct rbd_client *rbdc;
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02006115 int rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006116
6117 if (!try_module_get(THIS_MODULE))
6118 return -ENODEV;
6119
Alex Eldera725f65e2012-02-02 08:13:30 -06006120 /* parse add command */
Alex Elder859c31d2012-10-25 23:34:42 -05006121 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
Alex Elderdc79b112012-10-25 23:34:41 -05006122 if (rc < 0)
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006123 goto out;
Alex Eldera725f65e2012-02-02 08:13:30 -06006124
Alex Elder9d3997f2012-10-25 23:34:42 -05006125 rbdc = rbd_get_client(ceph_opts);
6126 if (IS_ERR(rbdc)) {
6127 rc = PTR_ERR(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05006128 goto err_out_args;
Alex Elder9d3997f2012-10-25 23:34:42 -05006129 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006130
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006131 /* pick the pool */
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04006132 rc = rbd_add_get_pool_id(rbdc, spec->pool_name);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006133 if (rc < 0) {
6134 if (rc == -ENOENT)
6135 pr_info("pool %s does not exist\n", spec->pool_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006136 goto err_out_client;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006137 }
Alex Elderc0cd10db2013-04-26 09:43:47 -05006138 spec->pool_id = (u64)rc;
Alex Elder859c31d2012-10-25 23:34:42 -05006139
Ilya Dryomovd1475432015-06-22 13:24:48 +03006140 rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02006141 if (!rbd_dev) {
6142 rc = -ENOMEM;
Alex Elderbd4ba652012-10-25 23:34:42 -05006143 goto err_out_client;
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02006144 }
Alex Elderc53d5892012-10-25 23:34:42 -05006145 rbdc = NULL; /* rbd_dev now owns this */
6146 spec = NULL; /* rbd_dev now owns this */
Ilya Dryomovd1475432015-06-22 13:24:48 +03006147 rbd_opts = NULL; /* rbd_dev now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006148
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02006149 rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
6150 if (!rbd_dev->config_info) {
6151 rc = -ENOMEM;
6152 goto err_out_rbd_dev;
6153 }
6154
Ilya Dryomov811c6682016-04-15 16:22:16 +02006155 down_write(&rbd_dev->header_rwsem);
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006156 rc = rbd_dev_image_probe(rbd_dev, 0);
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02006157 if (rc < 0) {
6158 up_write(&rbd_dev->header_rwsem);
Alex Elderc53d5892012-10-25 23:34:42 -05006159 goto err_out_rbd_dev;
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02006160 }
Alex Elder05fd6f62012-08-29 17:11:07 -05006161
Alex Elder7ce4eef2013-05-06 17:40:33 -05006162 /* If we are mapping a snapshot it must be marked read-only */
Alex Elder7ce4eef2013-05-06 17:40:33 -05006163 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Ilya Dryomov9568c932017-10-12 12:35:19 +02006164 rbd_dev->opts->read_only = true;
Alex Elder7ce4eef2013-05-06 17:40:33 -05006165
Alex Elderb536f692013-04-28 23:32:34 -05006166 rc = rbd_dev_device_setup(rbd_dev);
Ilya Dryomovfd22aef2017-04-13 12:17:37 +02006167 if (rc)
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02006168 goto err_out_image_probe;
Alex Elderb536f692013-04-28 23:32:34 -05006169
Ilya Dryomove010dd02017-04-13 12:17:39 +02006170 if (rbd_dev->opts->exclusive) {
6171 rc = rbd_add_acquire_lock(rbd_dev);
6172 if (rc)
6173 goto err_out_device_setup;
Alex Elderb536f692013-04-28 23:32:34 -05006174 }
6175
Ilya Dryomov5769ed02017-04-13 12:17:38 +02006176 /* Everything's ready. Announce the disk to the world. */
6177
6178 rc = device_add(&rbd_dev->dev);
6179 if (rc)
Ilya Dryomove010dd02017-04-13 12:17:39 +02006180 goto err_out_image_lock;
Ilya Dryomov5769ed02017-04-13 12:17:38 +02006181
6182 add_disk(rbd_dev->disk);
6183 /* see rbd_init_disk() */
6184 blk_put_queue(rbd_dev->disk->queue);
6185
6186 spin_lock(&rbd_dev_list_lock);
6187 list_add_tail(&rbd_dev->node, &rbd_dev_list);
6188 spin_unlock(&rbd_dev_list_lock);
6189
6190 pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
6191 (unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
6192 rbd_dev->header.features);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006193 rc = count;
6194out:
6195 module_put(THIS_MODULE);
6196 return rc;
Alex Elder3abef3b2013-05-13 20:35:37 -05006197
Ilya Dryomove010dd02017-04-13 12:17:39 +02006198err_out_image_lock:
6199 rbd_dev_image_unlock(rbd_dev);
Ilya Dryomov5769ed02017-04-13 12:17:38 +02006200err_out_device_setup:
6201 rbd_dev_device_release(rbd_dev);
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02006202err_out_image_probe:
6203 rbd_dev_image_release(rbd_dev);
Alex Elderc53d5892012-10-25 23:34:42 -05006204err_out_rbd_dev:
6205 rbd_dev_destroy(rbd_dev);
Alex Elderbd4ba652012-10-25 23:34:42 -05006206err_out_client:
Alex Elder9d3997f2012-10-25 23:34:42 -05006207 rbd_put_client(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05006208err_out_args:
Alex Elder859c31d2012-10-25 23:34:42 -05006209 rbd_spec_put(spec);
Ilya Dryomovd1475432015-06-22 13:24:48 +03006210 kfree(rbd_opts);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006211 goto out;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006212}
6213
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006214static ssize_t rbd_add(struct bus_type *bus,
6215 const char *buf,
6216 size_t count)
6217{
6218 if (single_major)
6219 return -EINVAL;
6220
6221 return do_rbd_add(bus, buf, count);
6222}
6223
6224static ssize_t rbd_add_single_major(struct bus_type *bus,
6225 const char *buf,
6226 size_t count)
6227{
6228 return do_rbd_add(bus, buf, count);
6229}
6230
Alex Elder05a46af2013-04-26 15:44:36 -05006231static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
6232{
Alex Elderad945fc2013-04-26 15:44:36 -05006233 while (rbd_dev->parent) {
Alex Elder05a46af2013-04-26 15:44:36 -05006234 struct rbd_device *first = rbd_dev;
6235 struct rbd_device *second = first->parent;
6236 struct rbd_device *third;
6237
6238 /*
6239 * Follow to the parent with no grandparent and
6240 * remove it.
6241 */
6242 while (second && (third = second->parent)) {
6243 first = second;
6244 second = third;
6245 }
Alex Elderad945fc2013-04-26 15:44:36 -05006246 rbd_assert(second);
Alex Elder8ad42cd2013-04-28 23:32:34 -05006247 rbd_dev_image_release(second);
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02006248 rbd_dev_destroy(second);
Alex Elderad945fc2013-04-26 15:44:36 -05006249 first->parent = NULL;
6250 first->parent_overlap = 0;
6251
6252 rbd_assert(first->parent_spec);
Alex Elder05a46af2013-04-26 15:44:36 -05006253 rbd_spec_put(first->parent_spec);
6254 first->parent_spec = NULL;
Alex Elder05a46af2013-04-26 15:44:36 -05006255 }
6256}
6257
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006258static ssize_t do_rbd_remove(struct bus_type *bus,
6259 const char *buf,
6260 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006261{
6262 struct rbd_device *rbd_dev = NULL;
Alex Elder751cc0e2013-05-31 15:17:01 -05006263 struct list_head *tmp;
6264 int dev_id;
Mike Christie0276dca2016-08-18 18:38:45 +02006265 char opt_buf[6];
Alex Elder82a442d2013-05-31 17:40:44 -05006266 bool already = false;
Mike Christie0276dca2016-08-18 18:38:45 +02006267 bool force = false;
Alex Elder0d8189e2013-04-27 09:59:30 -05006268 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006269
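	/*
	 * Input is "<dev-id> [force]", e.g. (hypothetical device id)
	 * "echo '0 force' > /sys/bus/rbd/remove".
	 */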
Mike Christie0276dca2016-08-18 18:38:45 +02006270 dev_id = -1;
6271 opt_buf[0] = '\0';
6272 sscanf(buf, "%d %5s", &dev_id, opt_buf);
6273 if (dev_id < 0) {
6274 pr_err("dev_id out of range\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006275 return -EINVAL;
Mike Christie0276dca2016-08-18 18:38:45 +02006276 }
6277 if (opt_buf[0] != '\0') {
6278 if (!strcmp(opt_buf, "force")) {
6279 force = true;
6280 } else {
6281 pr_err("bad remove option at '%s'\n", opt_buf);
6282 return -EINVAL;
6283 }
6284 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006285
Alex Elder751cc0e2013-05-31 15:17:01 -05006286 ret = -ENOENT;
6287 spin_lock(&rbd_dev_list_lock);
6288 list_for_each(tmp, &rbd_dev_list) {
6289 rbd_dev = list_entry(tmp, struct rbd_device, node);
6290 if (rbd_dev->dev_id == dev_id) {
6291 ret = 0;
6292 break;
6293 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006294 }
Alex Elder751cc0e2013-05-31 15:17:01 -05006295 if (!ret) {
6296 spin_lock_irq(&rbd_dev->lock);
Mike Christie0276dca2016-08-18 18:38:45 +02006297 if (rbd_dev->open_count && !force)
Alex Elder751cc0e2013-05-31 15:17:01 -05006298 ret = -EBUSY;
6299 else
Alex Elder82a442d2013-05-31 17:40:44 -05006300 already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
6301 &rbd_dev->flags);
Alex Elder751cc0e2013-05-31 15:17:01 -05006302 spin_unlock_irq(&rbd_dev->lock);
6303 }
6304 spin_unlock(&rbd_dev_list_lock);
Alex Elder82a442d2013-05-31 17:40:44 -05006305 if (ret < 0 || already)
Alex Elder1ba0f1e2013-05-31 15:17:01 -05006306 return ret;
Alex Elder751cc0e2013-05-31 15:17:01 -05006307
Mike Christie0276dca2016-08-18 18:38:45 +02006308 if (force) {
6309 /*
6310 * Prevent new IO from being queued and wait for existing
6311 * IO to complete/fail.
6312 */
6313 blk_mq_freeze_queue(rbd_dev->disk->queue);
6314 blk_set_queue_dying(rbd_dev->disk->queue);
6315 }
6316
Ilya Dryomov5769ed02017-04-13 12:17:38 +02006317 del_gendisk(rbd_dev->disk);
6318 spin_lock(&rbd_dev_list_lock);
6319 list_del_init(&rbd_dev->node);
6320 spin_unlock(&rbd_dev_list_lock);
6321 device_del(&rbd_dev->dev);
Ilya Dryomovfca27062013-12-16 18:02:40 +02006322
Ilya Dryomove010dd02017-04-13 12:17:39 +02006323 rbd_dev_image_unlock(rbd_dev);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006324 rbd_dev_device_release(rbd_dev);
Alex Elder8ad42cd2013-04-28 23:32:34 -05006325 rbd_dev_image_release(rbd_dev);
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02006326 rbd_dev_destroy(rbd_dev);
Alex Elder1ba0f1e2013-05-31 15:17:01 -05006327 return count;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006328}
6329
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006330static ssize_t rbd_remove(struct bus_type *bus,
6331 const char *buf,
6332 size_t count)
6333{
6334 if (single_major)
6335 return -EINVAL;
6336
6337 return do_rbd_remove(bus, buf, count);
6338}
6339
6340static ssize_t rbd_remove_single_major(struct bus_type *bus,
6341 const char *buf,
6342 size_t count)
6343{
6344 return do_rbd_remove(bus, buf, count);
6345}
6346
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006347/*
6348 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006349 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006350 */
6351static int rbd_sysfs_init(void)
6352{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006353 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006354
Alex Elderfed4c142012-02-07 12:03:36 -06006355 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06006356 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006357 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006358
Alex Elderfed4c142012-02-07 12:03:36 -06006359 ret = bus_register(&rbd_bus_type);
6360 if (ret < 0)
6361 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006362
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006363 return ret;
6364}
6365
6366static void rbd_sysfs_cleanup(void)
6367{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006368 bus_unregister(&rbd_bus_type);
Alex Elderfed4c142012-02-07 12:03:36 -06006369 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006370}
6371
Alex Elder1c2a9df2013-05-01 12:43:03 -05006372static int rbd_slab_init(void)
6373{
6374 rbd_assert(!rbd_img_request_cache);
Geliang Tang03d94402016-03-13 15:17:32 +08006375 rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
Alex Elder868311b2013-05-01 12:43:03 -05006376 if (!rbd_img_request_cache)
6377 return -ENOMEM;
6378
6379 rbd_assert(!rbd_obj_request_cache);
Geliang Tang03d94402016-03-13 15:17:32 +08006380 rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
Alex Elder78c2a442013-05-01 12:43:04 -05006381 if (!rbd_obj_request_cache)
6382 goto out_err;
6383
NeilBrownf856dc32017-06-18 14:38:58 +10006384 rbd_assert(!rbd_bio_clone);
6385 rbd_bio_clone = bioset_create(BIO_POOL_SIZE, 0, 0);
6386 if (!rbd_bio_clone)
6387 goto out_err_clone;
6388
Ilya Dryomov6c696d82017-01-25 18:16:23 +01006389 return 0;
Alex Elder1c2a9df2013-05-01 12:43:03 -05006390
NeilBrownf856dc32017-06-18 14:38:58 +10006391out_err_clone:
6392 kmem_cache_destroy(rbd_obj_request_cache);
6393 rbd_obj_request_cache = NULL;
Ilya Dryomov6c696d82017-01-25 18:16:23 +01006394out_err:
Alex Elder868311b2013-05-01 12:43:03 -05006395 kmem_cache_destroy(rbd_img_request_cache);
6396 rbd_img_request_cache = NULL;
Alex Elder1c2a9df2013-05-01 12:43:03 -05006397 return -ENOMEM;
6398}
6399
6400static void rbd_slab_exit(void)
6401{
Alex Elder868311b2013-05-01 12:43:03 -05006402 rbd_assert(rbd_obj_request_cache);
6403 kmem_cache_destroy(rbd_obj_request_cache);
6404 rbd_obj_request_cache = NULL;
6405
Alex Elder1c2a9df2013-05-01 12:43:03 -05006406 rbd_assert(rbd_img_request_cache);
6407 kmem_cache_destroy(rbd_img_request_cache);
6408 rbd_img_request_cache = NULL;
NeilBrownf856dc32017-06-18 14:38:58 +10006409
6410 rbd_assert(rbd_bio_clone);
6411 bioset_free(rbd_bio_clone);
6412 rbd_bio_clone = NULL;
Alex Elder1c2a9df2013-05-01 12:43:03 -05006413}
6414
Alex Eldercc344fa2013-02-19 12:25:56 -06006415static int __init rbd_init(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006416{
6417 int rc;
6418
Alex Elder1e32d342013-01-30 11:13:33 -06006419 if (!libceph_compatible(NULL)) {
6420 rbd_warn(NULL, "libceph incompatibility (quitting)");
Alex Elder1e32d342013-01-30 11:13:33 -06006421 return -EINVAL;
6422 }
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006423
Alex Elder1c2a9df2013-05-01 12:43:03 -05006424 rc = rbd_slab_init();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006425 if (rc)
6426 return rc;
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006427
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006428 /*
6429 * The number of active work items is limited by the number of
Ilya Dryomovf77303b2015-04-22 18:28:13 +03006430 * rbd devices * queue depth, so leave @max_active at default.
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006431 */
6432 rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
6433 if (!rbd_wq) {
6434 rc = -ENOMEM;
6435 goto err_out_slab;
6436 }
6437
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006438 if (single_major) {
6439 rbd_major = register_blkdev(0, RBD_DRV_NAME);
6440 if (rbd_major < 0) {
6441 rc = rbd_major;
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006442 goto err_out_wq;
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006443 }
6444 }
6445
Alex Elder1c2a9df2013-05-01 12:43:03 -05006446 rc = rbd_sysfs_init();
6447 if (rc)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006448 goto err_out_blkdev;
Alex Elder1c2a9df2013-05-01 12:43:03 -05006449
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006450 if (single_major)
6451 pr_info("loaded (major %d)\n", rbd_major);
6452 else
6453 pr_info("loaded\n");
6454
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006455 return 0;
6456
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006457err_out_blkdev:
6458 if (single_major)
6459 unregister_blkdev(rbd_major, RBD_DRV_NAME);
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006460err_out_wq:
6461 destroy_workqueue(rbd_wq);
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006462err_out_slab:
6463 rbd_slab_exit();
Alex Elder1c2a9df2013-05-01 12:43:03 -05006464 return rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006465}
6466
Alex Eldercc344fa2013-02-19 12:25:56 -06006467static void __exit rbd_exit(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006468{
Ilya Dryomovffe312c2014-05-20 15:46:04 +04006469 ida_destroy(&rbd_dev_id_ida);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006470 rbd_sysfs_cleanup();
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006471 if (single_major)
6472 unregister_blkdev(rbd_major, RBD_DRV_NAME);
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006473 destroy_workqueue(rbd_wq);
Alex Elder1c2a9df2013-05-01 12:43:03 -05006474 rbd_slab_exit();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006475}
6476
6477module_init(rbd_init);
6478module_exit(rbd_exit);
6479
Alex Elderd552c612013-05-31 20:13:09 -05006480MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006481MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
6482MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006483/* following authorship retained from original osdblk.c */
6484MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
6485
Ilya Dryomov90da2582013-12-13 15:28:56 +02006486MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006487MODULE_LICENSE("GPL");