
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		 Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/striper.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/workqueue.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * Increment the given counter and return its updated value.
 * If the counter is already 0 it will not be incremented.
 * If the counter is already at its maximum value, -EINVAL is
 * returned without updating it.
 */
static int atomic_inc_return_safe(atomic_t *v)
{
	unsigned int counter;

	counter = (unsigned int)__atomic_add_unless(v, 1, 0);
	if (counter <= (unsigned int)INT_MAX)
		return (int)counter;

	atomic_dec(v);

	return -EINVAL;
}

/* Decrement the counter.  Return the resulting value, or -EINVAL */
static int atomic_dec_return_safe(atomic_t *v)
{
	int counter;

	counter = atomic_dec_return(v);
	if (counter >= 0)
		return counter;

	atomic_inc(v);

	return -EINVAL;
}
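
/*
 * Editorial sketch (not driver code): the two helpers above form a
 * saturating reference count.  A caller, e.g. on the parent_ref field
 * declared in struct rbd_device below, bumps the count only while it
 * is still live (nonzero) and treats -EINVAL as "no reference taken":
 *
 *	if (atomic_inc_return_safe(&rbd_dev->parent_ref) > 0)
 *		// reference taken, parent may be used
 *	...
 *	if (atomic_dec_return_safe(&rbd_dev->parent_ref) == 0)
 *		// last reference dropped (hypothetical cleanup hook here)
 */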

#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
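/*
 * Editorial note: 510 appears chosen so that 510 8-byte snapshot ids
 * (4080 bytes) plus the small ceph_snap_context header stay within a
 * 4KB snapshot context allocation.
 */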

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING		(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
#define RBD_FEATURE_DATA_POOL		(1ULL<<7)
#define RBD_FEATURE_OPERATIONS		(1ULL<<8)

#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
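
/*
 * Editorial sketch (assumed shape, not a verbatim excerpt): an image
 * advertising feature bits outside RBD_FEATURES_SUPPORTED cannot be
 * mapped safely, so a probe-time guard might look like:
 *
 *	u64 unsup = features & ~RBD_FEATURES_SUPPORTED;
 *
 *	if (unsup) {
 *		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
 *			 unsup);
 *		return -ENXIO;
 *	}
 */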

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These six fields never change for a given rbd image */
	char *object_prefix;
	__u8 obj_order;
	u64 stripe_unit;
	u64 stripe_count;
	s64 data_pool_id;
	u64 features;		/* Might be changeable someday? */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* format 1 only */
	u64 *snap_sizes;	/* format 1 only */
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the ids in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the ids associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	const char	*pool_name;

	const char	*image_id;
	const char	*image_name;

	u64		snap_id;
	const char	*snap_name;

	struct kref	kref;
};

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};

struct rbd_img_request;

enum obj_request_type {
	OBJ_REQUEST_NODATA = 1,
	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
	OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
};

enum obj_operation_type {
	OBJ_OP_READ = 1,
	OBJ_OP_WRITE,
	OBJ_OP_DISCARD,
};

/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 *                       need copyup
 * RBD_OBJ_WRITE_GUARD ---------------> RBD_OBJ_WRITE_COPYUP
 *        |     ^                              |
 *        v     \------------------------------/
 *      done
 *        ^
 *        |
 * RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * there is a parent or not.
 */
enum rbd_obj_write_state {
	RBD_OBJ_WRITE_FLAT = 1,
	RBD_OBJ_WRITE_GUARD,
	RBD_OBJ_WRITE_COPYUP,
};
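
/*
 * Editorial sketch of how a completion handler consumes the states
 * above (assumed shape, not the driver's actual handler): dispatch on
 * write_state and either finish, kick off a copyup, or loop back:
 *
 *	switch (obj_req->write_state) {
 *	case RBD_OBJ_WRITE_FLAT:
 *		return true;			// done
 *	case RBD_OBJ_WRITE_GUARD:
 *		if (result == -ENOENT)		// need copyup
 *			return rbd_obj_start_copyup(obj_req); // hypothetical
 *		return true;
 *	case RBD_OBJ_WRITE_COPYUP:
 *		return true;			// done
 *	}
 */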

struct rbd_obj_request {
	struct ceph_object_extent ex;
	union {
		bool			tried_parent;	/* for reads */
		enum rbd_obj_write_state write_state;	/* for writes */
	};

	struct rbd_img_request	*img_request;
	struct ceph_file_extent	*img_extents;
	u32			num_img_extents;

	union {
		struct ceph_bio_iter	bio_pos;
		struct {
			struct ceph_bvec_iter	bvec_pos;
			u32			bvec_count;
			u32			bvec_idx;
		};
	};
	struct bio_vec		*copyup_bvecs;
	u32			copyup_bvec_count;

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	int			result;

	struct kref		kref;
};

enum img_req_flags {
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};

struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	enum obj_operation_type	op_type;
	enum obj_request_type	data_type;
	unsigned long		flags;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};
	spinlock_t		completion_lock;
	u64			xferred;/* aggregate bytes transferred */
	int			result;	/* first nonzero obj_request result */

	struct list_head	object_extents;	/* obj_req.ex structs */
	u32			obj_request_count;
	u32			pending_count;

	struct kref		kref;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)
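
/*
 * Editorial usage sketch for the two iterators above (assumed caller,
 * not part of this section): walk an image request's object requests;
 * the _safe variant tolerates unlinking the current entry:
 *
 *	struct rbd_obj_request *obj_req, *next_obj_req;
 *
 *	for_each_obj_request(img_req, obj_req)
 *		dout("obj_req %p\n", obj_req);
 *
 *	for_each_obj_request_safe(img_req, obj_req, next_obj_req)
 *		rbd_obj_request_put(obj_req);
 */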

enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};

enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};

/* WatchNotify::ClientId */
struct rbd_client_id {
	u64 gid;
	u64 handle;
};

struct rbd_mapping {
	u64                     size;
	u64                     features;
};

/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	int			minor;
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
	char			*config_info;	/* add{,_single_major} string */

	struct ceph_object_id	header_oid;
	struct ceph_object_locator header_oloc;

	struct ceph_file_layout	layout;		/* used for all rbd requests */

	struct mutex		watch_mutex;
	enum rbd_watch_state	watch_state;
	struct ceph_osd_linger_request *watch_handle;
	u64			watch_cookie;
	struct delayed_work	watch_dwork;

	struct rw_semaphore	lock_rwsem;
	enum rbd_lock_state	lock_state;
	char			lock_cookie[32];
	struct rbd_client_id	owner_cid;
	struct work_struct	acquired_lock_work;
	struct work_struct	released_lock_work;
	struct delayed_work	lock_dwork;
	struct work_struct	unlock_work;
	wait_queue_head_t	lock_waitq;

	struct workqueue_struct	*task_wq;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	atomic_t		parent_ref;
	struct rbd_device	*parent;

	/* Block layer tags. */
	struct blk_mq_tag_set	tag_set;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags:
 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 *   by rbd_dev->lock
 * - BLACKLISTED is protected by rbd_dev->lock_rwsem
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
	RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
};

static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

static struct workqueue_struct *rbd_wq;

/*
 * single-major requires >= 0.75 version of userspace rbd utility.
 */
static bool single_major = true;
module_param(single_major, bool, S_IRUGO);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
				    size_t count);
static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
				       size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);

static int rbd_dev_id_to_minor(int dev_id)
{
	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
}

static int minor_to_rbd_dev_id(int minor)
{
	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}
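
/*
 * Worked example (editorial): with RBD_SINGLE_MAJOR_PART_SHIFT == 4,
 * each device owns a block of 2^4 == 16 minors under the shared
 * major, leaving room for 15 partitions.  So rbd_dev_id_to_minor(3)
 * == 48, and any minor in [48, 63] (whole device or one of its
 * partitions) maps back via minor_to_rbd_dev_id() to dev_id 3.
 */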

static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
}

static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	bool is_lock_owner;

	down_read(&rbd_dev->lock_rwsem);
	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
	return is_lock_owner;
}

static ssize_t rbd_supported_features_show(struct bus_type *bus, char *buf)
{
	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}

static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
static BUS_ATTR(supported_features, S_IRUGO, rbd_supported_features_show, NULL);

static struct attribute *rbd_bus_attrs[] = {
	&bus_attr_add.attr,
	&bus_attr_remove.attr,
	&bus_attr_add_single_major.attr,
	&bus_attr_remove_single_major.attr,
	&bus_attr_supported_features.attr,
	NULL,
};

static umode_t rbd_bus_is_visible(struct kobject *kobj,
				  struct attribute *attr, int index)
{
	if (!single_major &&
	    (attr == &bus_attr_add_single_major.attr ||
	     attr == &bus_attr_remove_single_major.attr))
		return 0;

	return attr->mode;
}

static const struct attribute_group rbd_bus_group = {
	.attrs = rbd_bus_attrs,
	.is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_groups	= rbd_bus_groups,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}
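
/*
 * Usage examples (editorial): rbd_warn() accepts a NULL rbd_dev for
 * messages not tied to a particular mapping; otherwise it prefixes
 * the message with the most specific identity available:
 *
 *	rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
 *	rbd_warn(rbd_dev, "snap lookup failed: %d", ret);  // hypothetical
 */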

#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_header_info(struct rbd_device *rbd_dev);
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
		u8 *order, u64 *snap_size);
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features);

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	(void) get_device(&rbd_dev->dev);

	return 0;
}

static void rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	put_device(&rbd_dev->dev);
}

static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
{
	int ro;

	if (get_user(ro, (int __user *)arg))
		return -EFAULT;

	/* Snapshots can't be marked read-write */
	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
		return -EROFS;

	/* Let blkdev_roset() handle it */
	return -ENOTTY;
}

static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	int ret;

	switch (cmd) {
	case BLKROSET:
		ret = rbd_ioctl_set_ro(rbd_dev, arg);
		break;
	default:
		ret = -ENOTTY;
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	return rbd_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
	.ioctl			= rbd_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl		= rbd_compat_ioctl,
#endif
};

/*
 * Initialize an rbd client instance.  Success or not, this function
 * consumes ceph_opts.  Caller holds client_mutex.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	rbdc->client = ceph_create_client(ceph_opts, rbdc);
	if (IS_ERR(rbdc->client))
		goto out_rbdc;
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_client;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;
out_client:
	ceph_destroy_client(rbdc->client);
out_rbdc:
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}

/*
 * (Per device) rbd map options
 */
enum {
	Opt_queue_depth,
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_err
};

static match_table_t rbd_opts_tokens = {
	{Opt_queue_depth, "queue_depth=%d"},
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	{Opt_lock_on_read, "lock_on_read"},
	{Opt_exclusive, "exclusive"},
	{Opt_err, NULL}
};

struct rbd_options {
	int	queue_depth;
	bool	read_only;
	bool	lock_on_read;
	bool	exclusive;
};

#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
#define RBD_READ_ONLY_DEFAULT	false
#define RBD_LOCK_ON_READ_DEFAULT false
#define RBD_EXCLUSIVE_DEFAULT	false

static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token, argstr[0].from);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_queue_depth:
		if (intval < 1) {
			pr_err("queue_depth out of range\n");
			return -EINVAL;
		}
		rbd_opts->queue_depth = intval;
		break;
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	case Opt_lock_on_read:
		rbd_opts->lock_on_read = true;
		break;
	case Opt_exclusive:
		rbd_opts->exclusive = true;
		break;
	default:
		/* libceph prints "bad option" msg */
		return -EINVAL;
	}

	return 0;
}
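
/*
 * Example (editorial): the per-device options arrive as a single
 * comma-separated string on the map request; libceph splits it and
 * feeds each token it does not recognize to this callback, so e.g.
 *
 *	queue_depth=128,lock_on_read,read_only
 *
 * results in queue_depth 128 and both booleans set in rbd_options.
 */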

static char *obj_op_name(enum obj_operation_type op_type)
{
	switch (op_type) {
	case OBJ_OP_READ:
		return "read";
	case OBJ_OP_WRITE:
		return "write";
	case OBJ_OP_DISCARD:
		return "discard";
	default:
		return "???";
	}
}

/*
 * Destroy ceph client
 *
 * Takes and releases rbd_client_list_lock to unlink the client, so
 * the caller must not already hold it.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

static int wait_for_latest_osdmap(struct ceph_client *client)
{
	u64 newest_epoch;
	int ret;

	ret = ceph_monc_get_version(&client->monc, "osdmap", &newest_epoch);
	if (ret)
		return ret;

	if (client->osdc.osdmap->epoch >= newest_epoch)
		return 0;

	ceph_osdc_maybe_request_map(&client->osdc);
	return ceph_monc_wait_osdmap(&client->monc, newest_epoch,
				     client->options->mount_timeout);
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, ceph_opts is consumed by this
 * function.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret;

	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		ceph_destroy_options(ceph_opts);

		/*
		 * Using an existing client.  Make sure ->pg_pools is up to
		 * date before we look up the pool id in do_rbd_add().
		 */
		ret = wait_for_latest_osdmap(rbdc->client);
		if (ret) {
			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
			rbd_put_client(rbdc);
			rbdc = ERR_PTR(ret);
		}
	} else {
		rbdc = rbd_client_create(ceph_opts);
	}
	mutex_unlock(&client_mutex);

	return rbdc;
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
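
/*
 * Editorial note on the order checks above: "order" is log2 of the
 * object size (see rbd_obj_bytes() below), so valid headers describe
 * objects from 512 bytes (order 9, one sector) up to 2^31 bytes;
 * the common 4MB default corresponds to order 22.
 */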

/*
 * returns the size of an object in the image
 */
static u32 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1U << header->obj_order;
}

static void rbd_init_layout(struct rbd_device *rbd_dev)
{
	if (rbd_dev->header.stripe_unit == 0 ||
	    rbd_dev->header.stripe_count == 0) {
		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
		rbd_dev->header.stripe_count = 1;
	}

	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
}
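
/*
 * Editorial example: for an image without the STRIPINGV2 feature the
 * header carries no striping parameters, so the fallback above yields
 * stripe_unit == object size and stripe_count == 1; with order 22
 * that is plain 4MB objects laid out one after another.  The layout's
 * pool id only differs from the spec's pool when the DATA_POOL
 * feature supplies a separate data_pool_id.
 */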
978
Ilya Dryomov5bc3fb12017-01-25 18:16:22 +0100979/*
Alex Elderbb23e372013-05-06 09:51:29 -0500980 * Fill an rbd image header with information from the given format 1
981 * on-disk header.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700982 */
Alex Elder662518b2013-05-06 09:51:29 -0500983static int rbd_header_from_disk(struct rbd_device *rbd_dev,
Alex Elder4156d992012-08-02 11:29:46 -0500984 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700985{
Alex Elder662518b2013-05-06 09:51:29 -0500986 struct rbd_image_header *header = &rbd_dev->header;
Alex Elderbb23e372013-05-06 09:51:29 -0500987 bool first_time = header->object_prefix == NULL;
988 struct ceph_snap_context *snapc;
989 char *object_prefix = NULL;
990 char *snap_names = NULL;
991 u64 *snap_sizes = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500992 u32 snap_count;
Alex Elderbb23e372013-05-06 09:51:29 -0500993 int ret = -ENOMEM;
Alex Elder621901d2012-08-23 23:22:06 -0500994 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700995
Alex Elderbb23e372013-05-06 09:51:29 -0500996 /* Allocate this now to avoid having to handle failure below */
997
998 if (first_time) {
Ilya Dryomov848d7962017-01-25 18:16:21 +0100999 object_prefix = kstrndup(ondisk->object_prefix,
1000 sizeof(ondisk->object_prefix),
1001 GFP_KERNEL);
Alex Elderbb23e372013-05-06 09:51:29 -05001002 if (!object_prefix)
1003 return -ENOMEM;
Alex Elderbb23e372013-05-06 09:51:29 -05001004 }
1005
1006 /* Allocate the snapshot context and fill it in */
Alex Elder6a523252012-07-19 17:12:59 -05001007
Alex Elder103a1502012-08-02 11:29:45 -05001008 snap_count = le32_to_cpu(ondisk->snap_count);
Alex Elderbb23e372013-05-06 09:51:29 -05001009 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
1010 if (!snapc)
1011 goto out_err;
1012 snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001013 if (snap_count) {
Alex Elderbb23e372013-05-06 09:51:29 -05001014 struct rbd_image_snap_ondisk *snaps;
Alex Elderf785cc12012-08-23 23:22:06 -05001015 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
1016
Alex Elderbb23e372013-05-06 09:51:29 -05001017 /* We'll keep a copy of the snapshot names... */
Alex Elder621901d2012-08-23 23:22:06 -05001018
Alex Elderbb23e372013-05-06 09:51:29 -05001019 if (snap_names_len > (u64)SIZE_MAX)
1020 goto out_2big;
1021 snap_names = kmalloc(snap_names_len, GFP_KERNEL);
1022 if (!snap_names)
Alex Elder6a523252012-07-19 17:12:59 -05001023 goto out_err;
Alex Elderbb23e372013-05-06 09:51:29 -05001024
1025 /* ...as well as the array of their sizes. */
Markus Elfring88a25a52016-09-11 12:21:25 +02001026 snap_sizes = kmalloc_array(snap_count,
1027 sizeof(*header->snap_sizes),
1028 GFP_KERNEL);
Alex Elderbb23e372013-05-06 09:51:29 -05001029 if (!snap_sizes)
1030 goto out_err;
1031
		/*
		 * Copy the names, and fill in each snapshot's id
		 * and size.
		 *
		 * Note that rbd_dev_v1_header_info() guarantees the
		 * ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, so this memcpy() is safe.
		 */
		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
		snaps = ondisk->snaps;
		for (i = 0; i < snap_count; i++) {
			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
		}
	}

	/* We won't fail any more, fill in the header */

	if (first_time) {
		header->object_prefix = object_prefix;
		header->obj_order = ondisk->options.order;
		rbd_init_layout(rbd_dev);
	} else {
		ceph_put_snap_context(header->snapc);
		kfree(header->snap_names);
		kfree(header->snap_sizes);
	}

	/* The remaining fields always get updated (when we refresh) */

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->snapc = snapc;
	header->snap_names = snap_names;
	header->snap_sizes = snap_sizes;

	return 0;
out_2big:
	ret = -EIO;
out_err:
	kfree(snap_sizes);
	kfree(snap_names);
	ceph_put_snap_context(snapc);
	kfree(object_prefix);

	return ret;
}

static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	const char *snap_name;

	rbd_assert(which < rbd_dev->header.snapc->num_snaps);

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which--)
		snap_name += strlen(snap_name) + 1;

	return kstrdup(snap_name, GFP_KERNEL);
}

/*
 * Snapshot id comparison function for use with qsort()/bsearch().
 * Note that result is for snapshots in *descending* order.
 */
static int snapid_compare_reverse(const void *s1, const void *s2)
{
	u64 snap_id1 = *(u64 *)s1;
	u64 snap_id2 = *(u64 *)s2;

	if (snap_id1 < snap_id2)
		return 1;
	return snap_id1 == snap_id2 ? 0 : -1;
}

/*
 * Search a snapshot context to see if the given snapshot id is
 * present.
 *
 * Returns the position of the snapshot id in the array if it's found,
 * or BAD_SNAP_INDEX otherwise.
 *
 * Note: The snapshot array is kept sorted (by the osd) in
 * reverse order, highest snapshot id first.
 */
static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	u64 *found;

	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
				sizeof (snap_id), snapid_compare_reverse);

	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
}
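
/*
 * Worked example (editorial): with snapc->snaps == { 40, 30, 10 }
 * (descending, as the osd keeps it), looking up snap_id 30 makes
 * bsearch() land on &snaps[1] via snapid_compare_reverse(), so the
 * function returns 1; looking up snap_id 20 matches nothing and
 * returns BAD_SNAP_INDEX.
 */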
1129
Alex Elder2ad3d712013-04-30 00:44:33 -05001130static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
1131 u64 snap_id)
Alex Elder54cac612013-04-30 00:44:33 -05001132{
1133 u32 which;
Josh Durginda6a6b62013-09-04 17:57:31 -07001134 const char *snap_name;
Alex Elder54cac612013-04-30 00:44:33 -05001135
1136 which = rbd_dev_snap_index(rbd_dev, snap_id);
1137 if (which == BAD_SNAP_INDEX)
Josh Durginda6a6b62013-09-04 17:57:31 -07001138 return ERR_PTR(-ENOENT);
Alex Elder54cac612013-04-30 00:44:33 -05001139
Josh Durginda6a6b62013-09-04 17:57:31 -07001140 snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1141 return snap_name ? snap_name : ERR_PTR(-ENOMEM);
Alex Elder54cac612013-04-30 00:44:33 -05001142}
1143
Alex Elder9e15b772012-10-30 19:40:33 -05001144static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
1145{
Alex Elder9e15b772012-10-30 19:40:33 -05001146 if (snap_id == CEPH_NOSNAP)
1147 return RBD_SNAP_HEAD_NAME;
1148
Alex Elder54cac612013-04-30 00:44:33 -05001149 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1150 if (rbd_dev->image_format == 1)
1151 return rbd_dev_v1_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -05001152
Alex Elder54cac612013-04-30 00:44:33 -05001153 return rbd_dev_v2_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -05001154}
1155
Alex Elder2ad3d712013-04-30 00:44:33 -05001156static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
1157 u64 *snap_size)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001158{
Alex Elder2ad3d712013-04-30 00:44:33 -05001159 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1160 if (snap_id == CEPH_NOSNAP) {
1161 *snap_size = rbd_dev->header.image_size;
1162 } else if (rbd_dev->image_format == 1) {
1163 u32 which;
Alex Elder00f1f362012-02-07 12:03:36 -06001164
Alex Elder2ad3d712013-04-30 00:44:33 -05001165 which = rbd_dev_snap_index(rbd_dev, snap_id);
1166 if (which == BAD_SNAP_INDEX)
1167 return -ENOENT;
Alex Elder00f1f362012-02-07 12:03:36 -06001168
Alex Elder2ad3d712013-04-30 00:44:33 -05001169 *snap_size = rbd_dev->header.snap_sizes[which];
1170 } else {
1171 u64 size = 0;
1172 int ret;
1173
1174 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
1175 if (ret)
1176 return ret;
1177
1178 *snap_size = size;
1179 }
1180 return 0;
1181}
1182
1183static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
1184 u64 *snap_features)
1185{
1186 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1187 if (snap_id == CEPH_NOSNAP) {
1188 *snap_features = rbd_dev->header.features;
1189 } else if (rbd_dev->image_format == 1) {
1190 *snap_features = 0; /* No features for format 1 */
1191 } else {
1192 u64 features = 0;
1193 int ret;
1194
1195 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
1196 if (ret)
1197 return ret;
1198
1199 *snap_features = features;
1200 }
1201 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001202}
1203
Alex Elderd1cf5782013-04-27 09:59:30 -05001204static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001205{
Alex Elder8f4b7d92013-05-06 07:40:30 -05001206 u64 snap_id = rbd_dev->spec->snap_id;
Alex Elder2ad3d712013-04-30 00:44:33 -05001207 u64 size = 0;
1208 u64 features = 0;
1209 int ret;
Alex Elder8b0241f2013-04-25 23:15:08 -05001210
Alex Elder2ad3d712013-04-30 00:44:33 -05001211 ret = rbd_snap_size(rbd_dev, snap_id, &size);
1212 if (ret)
1213 return ret;
1214 ret = rbd_snap_features(rbd_dev, snap_id, &features);
1215 if (ret)
1216 return ret;
1217
1218 rbd_dev->mapping.size = size;
1219 rbd_dev->mapping.features = features;
1220
Alex Elder8b0241f2013-04-25 23:15:08 -05001221 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001222}
1223
Alex Elderd1cf5782013-04-27 09:59:30 -05001224static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1225{
1226 rbd_dev->mapping.size = 0;
1227 rbd_dev->mapping.features = 0;
Alex Elder200a6a82013-04-28 23:32:34 -05001228}
1229
Ilya Dryomov5359a172018-01-20 10:30:10 +01001230static void zero_bvec(struct bio_vec *bv)
Alex Elder65ccfe22012-08-09 10:33:26 -07001231{
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001232 void *buf;
Ilya Dryomov5359a172018-01-20 10:30:10 +01001233 unsigned long flags;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001234
Ilya Dryomov5359a172018-01-20 10:30:10 +01001235 buf = bvec_kmap_irq(bv, &flags);
1236 memset(buf, 0, bv->bv_len);
1237 flush_dcache_page(bv->bv_page);
1238 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001239}
1240
Ilya Dryomov5359a172018-01-20 10:30:10 +01001241static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
Alex Elderb9434c52013-04-19 15:34:50 -05001242{
Ilya Dryomov5359a172018-01-20 10:30:10 +01001243 struct ceph_bio_iter it = *bio_pos;
Alex Elderb9434c52013-04-19 15:34:50 -05001244
Ilya Dryomov5359a172018-01-20 10:30:10 +01001245 ceph_bio_iter_advance(&it, off);
1246 ceph_bio_iter_advance_step(&it, bytes, ({
1247 zero_bvec(&bv);
1248 }));
Alex Elderb9434c52013-04-19 15:34:50 -05001249}
1250
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001251static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001252{
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001253 struct ceph_bvec_iter it = *bvec_pos;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001254
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001255 ceph_bvec_iter_advance(&it, off);
1256 ceph_bvec_iter_advance_step(&it, bytes, ({
1257 zero_bvec(&bv);
1258 }));
Alex Elderf7760da2012-10-20 22:17:27 -05001259}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001260
Alex Elderf7760da2012-10-20 22:17:27 -05001261/*
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001262 * Zero a range in @obj_req's data buffer defined by a bio (list) or
Ilya Dryomovafb97882018-02-06 19:26:35 +01001263 * (private) bio_vec array.
Alex Elderf7760da2012-10-20 22:17:27 -05001264 *
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001265 * @off is relative to the start of the data buffer.
Alex Elderf7760da2012-10-20 22:17:27 -05001266 */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001267static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
1268 u32 bytes)
Alex Elderf7760da2012-10-20 22:17:27 -05001269{
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001270 switch (obj_req->img_request->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001271 case OBJ_REQUEST_BIO:
1272 zero_bios(&obj_req->bio_pos, off, bytes);
1273 break;
1274 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01001275 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001276 zero_bvecs(&obj_req->bvec_pos, off, bytes);
1277 break;
1278 default:
1279 rbd_assert(0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001280 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06001281}
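
/*
 * A minimal usage sketch with hypothetical numbers, mirroring the
 * short-read handling in rbd_obj_handle_read() below: a read that
 * came back with only 4096 of a 16384-byte object extent is padded
 * with
 *
 *	rbd_obj_zero_range(obj_req, 4096, 16384 - 4096);
 *
 * The owning image request's data_type selects the bio- or
 * bvec-based zeroing helper above.
 */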
1282
1283static void rbd_obj_request_destroy(struct kref *kref);
1284static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1285{
1286 rbd_assert(obj_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001287 dout("%s: obj %p (was %d)\n", __func__, obj_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001288 kref_read(&obj_request->kref));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001289 kref_put(&obj_request->kref, rbd_obj_request_destroy);
1290}
1291
Alex Elder0f2d5be2014-04-26 14:21:44 +04001292static void rbd_img_request_get(struct rbd_img_request *img_request)
1293{
1294 dout("%s: img %p (was %d)\n", __func__, img_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001295 kref_read(&img_request->kref));
Alex Elder0f2d5be2014-04-26 14:21:44 +04001296 kref_get(&img_request->kref);
1297}
1298
Alex Elderbf0d5f502012-11-22 00:00:08 -06001299static void rbd_img_request_destroy(struct kref *kref);
1300static void rbd_img_request_put(struct rbd_img_request *img_request)
1301{
1302 rbd_assert(img_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001303 dout("%s: img %p (was %d)\n", __func__, img_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001304 kref_read(&img_request->kref));
Ilya Dryomove93aca02018-02-06 19:26:35 +01001305 kref_put(&img_request->kref, rbd_img_request_destroy);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001306}
1307
1308static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1309 struct rbd_obj_request *obj_request)
1310{
Alex Elder25dcf952013-01-25 17:08:55 -06001311 rbd_assert(obj_request->img_request == NULL);
1312
Alex Elderb155e862013-04-15 14:50:37 -05001313 /* Image request now owns object's original reference */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001314 obj_request->img_request = img_request;
Alex Elder25dcf952013-01-25 17:08:55 -06001315 img_request->obj_request_count++;
Ilya Dryomov7114eda2018-02-01 11:50:47 +01001316 img_request->pending_count++;
Ilya Dryomov15961b42018-02-01 11:50:47 +01001317 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001318}
1319
1320static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1321 struct rbd_obj_request *obj_request)
1322{
Ilya Dryomov15961b42018-02-01 11:50:47 +01001323 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001324 list_del(&obj_request->ex.oe_item);
Alex Elder25dcf952013-01-25 17:08:55 -06001325 rbd_assert(img_request->obj_request_count > 0);
1326 img_request->obj_request_count--;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001327 rbd_assert(obj_request->img_request == img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001328 rbd_obj_request_put(obj_request);
1329}
1330
Ilya Dryomov980917f2016-09-12 18:59:42 +02001331static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001332{
Ilya Dryomov980917f2016-09-12 18:59:42 +02001333 struct ceph_osd_request *osd_req = obj_request->osd_req;
1334
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001335 dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001336 obj_request, obj_request->ex.oe_objno, obj_request->ex.oe_off,
1337 obj_request->ex.oe_len, osd_req);
Ilya Dryomov980917f2016-09-12 18:59:42 +02001338 ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001339}
1340
Alex Elder0c425242013-02-08 09:55:49 -06001341/*
1342 * The default/initial value for all image request flags is 0. Each
1343 * is conditionally set to 1 at image request initialization time
	1344	 * and currently never changes thereafter.
1345 */
Alex Elderd0b2e942013-01-24 16:13:36 -06001346static void img_request_layered_set(struct rbd_img_request *img_request)
1347{
1348 set_bit(IMG_REQ_LAYERED, &img_request->flags);
1349 smp_mb();
1350}
1351
Alex Eldera2acd002013-05-08 22:50:04 -05001352static void img_request_layered_clear(struct rbd_img_request *img_request)
1353{
1354 clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1355 smp_mb();
1356}
1357
Alex Elderd0b2e942013-01-24 16:13:36 -06001358static bool img_request_layered_test(struct rbd_img_request *img_request)
1359{
1360 smp_mb();
1361 return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1362}
1363
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001364static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001365{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001366 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1367
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001368 return !obj_req->ex.oe_off &&
1369 obj_req->ex.oe_len == rbd_dev->layout.object_size;
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001370}
1371
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001372static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
Alex Elder6e2a4502013-03-27 09:16:30 -05001373{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001374 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Alex Elderb9434c52013-04-19 15:34:50 -05001375
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001376 return obj_req->ex.oe_off + obj_req->ex.oe_len ==
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001377 rbd_dev->layout.object_size;
1378}
1379
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001380static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
1381{
1382 return ceph_file_extents_bytes(obj_req->img_extents,
1383 obj_req->num_img_extents);
1384}
1385
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001386static bool rbd_img_is_write(struct rbd_img_request *img_req)
1387{
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001388 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001389 case OBJ_OP_READ:
1390 return false;
1391 case OBJ_OP_WRITE:
1392 case OBJ_OP_DISCARD:
1393 return true;
1394 default:
1395 rbd_assert(0);
Alex Elder6e2a4502013-03-27 09:16:30 -05001396 }
Alex Elder6e2a4502013-03-27 09:16:30 -05001397}
1398
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001399static void rbd_obj_handle_request(struct rbd_obj_request *obj_req);
Ilya Dryomov27617132015-07-16 17:36:11 +03001400
Ilya Dryomov85e084f2016-04-28 16:07:24 +02001401static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001402{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001403 struct rbd_obj_request *obj_req = osd_req->r_priv;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001404
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001405 dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
1406 osd_req->r_result, obj_req);
1407 rbd_assert(osd_req == obj_req->osd_req);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001408
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001409 obj_req->result = osd_req->r_result < 0 ? osd_req->r_result : 0;
1410 if (!obj_req->result && !rbd_img_is_write(obj_req->img_request))
1411 obj_req->xferred = osd_req->r_result;
1412 else
1413 /*
1414 * Writes aren't allowed to return a data payload. In some
1415 * guarded write cases (e.g. stat + zero on an empty object)
1416 * a stat response makes it through, but we don't care.
1417 */
1418 obj_req->xferred = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001419
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001420 rbd_obj_handle_request(obj_req);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001421}
1422
Alex Elder9d4df012013-04-19 15:34:50 -05001423static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
Alex Elder430c28c2013-04-03 21:32:51 -05001424{
Alex Elder8c042b02013-04-03 01:28:58 -05001425 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder430c28c2013-04-03 21:32:51 -05001426
Ilya Dryomova162b302018-01-30 17:52:10 +01001427 osd_req->r_flags = CEPH_OSD_FLAG_READ;
Ilya Dryomov7c848832016-09-15 17:56:39 +02001428 osd_req->r_snapid = obj_request->img_request->snap_id;
Alex Elder9d4df012013-04-19 15:34:50 -05001429}
1430
1431static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
1432{
Alex Elder9d4df012013-04-19 15:34:50 -05001433 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder9d4df012013-04-19 15:34:50 -05001434
Ilya Dryomova162b302018-01-30 17:52:10 +01001435 osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
Deepa Dinamani1134e092017-05-08 15:59:19 -07001436 ktime_get_real_ts(&osd_req->r_mtime);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001437 osd_req->r_data_offset = obj_request->ex.oe_off;
Alex Elder430c28c2013-04-03 21:32:51 -05001438}
1439
Ilya Dryomovbc812072017-01-25 18:16:23 +01001440static struct ceph_osd_request *
Ilya Dryomova162b302018-01-30 17:52:10 +01001441rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops)
Ilya Dryomovbc812072017-01-25 18:16:23 +01001442{
Ilya Dryomova162b302018-01-30 17:52:10 +01001443 struct rbd_img_request *img_req = obj_req->img_request;
1444 struct rbd_device *rbd_dev = img_req->rbd_dev;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001445 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1446 struct ceph_osd_request *req;
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001447 const char *name_format = rbd_dev->image_format == 1 ?
1448 RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001449
Ilya Dryomova162b302018-01-30 17:52:10 +01001450 req = ceph_osdc_alloc_request(osdc,
1451 (rbd_img_is_write(img_req) ? img_req->snapc : NULL),
1452 num_ops, false, GFP_NOIO);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001453 if (!req)
1454 return NULL;
1455
Ilya Dryomovbc812072017-01-25 18:16:23 +01001456 req->r_callback = rbd_osd_req_callback;
Ilya Dryomova162b302018-01-30 17:52:10 +01001457 req->r_priv = obj_req;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001458
1459 req->r_base_oloc.pool = rbd_dev->layout.pool_id;
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001460 if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001461 rbd_dev->header.object_prefix, obj_req->ex.oe_objno))
Ilya Dryomovbc812072017-01-25 18:16:23 +01001462 goto err_req;
1463
1464 if (ceph_osdc_alloc_messages(req, GFP_NOIO))
1465 goto err_req;
1466
1467 return req;
1468
1469err_req:
1470 ceph_osdc_put_request(req);
1471 return NULL;
1472}
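
/*
 * Typical call sketch, as used by the setup helpers below -- a plain
 * read needs a single extent op:
 *
 *	obj_req->osd_req = rbd_osd_req_create(obj_req, 1);
 *	if (!obj_req->osd_req)
 *		return -ENOMEM;
 *
 * For a format 2 image the generated object name looks like
 * "rbd_data.<image id>.0000000000000005" (object prefix plus
 * zero-padded oe_objno).
 */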
1473
Alex Elderbf0d5f502012-11-22 00:00:08 -06001474static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
1475{
1476 ceph_osdc_put_request(osd_req);
1477}
1478
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001479static struct rbd_obj_request *rbd_obj_request_create(void)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001480{
1481 struct rbd_obj_request *obj_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001482
Ilya Dryomov5a60e872015-06-24 17:24:33 +03001483 obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
Ilya Dryomov6c696d82017-01-25 18:16:23 +01001484 if (!obj_request)
Alex Elderf907ad52013-05-01 12:43:03 -05001485 return NULL;
Alex Elderf907ad52013-05-01 12:43:03 -05001486
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001487 ceph_object_extent_init(&obj_request->ex);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001488 kref_init(&obj_request->kref);
1489
Ilya Dryomov67e2b652017-01-25 18:16:22 +01001490 dout("%s %p\n", __func__, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001491 return obj_request;
1492}
1493
1494static void rbd_obj_request_destroy(struct kref *kref)
1495{
1496 struct rbd_obj_request *obj_request;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001497 u32 i;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001498
1499 obj_request = container_of(kref, struct rbd_obj_request, kref);
1500
Alex Elder37206ee2013-02-20 17:32:08 -06001501 dout("%s: obj %p\n", __func__, obj_request);
1502
Alex Elderbf0d5f502012-11-22 00:00:08 -06001503 if (obj_request->osd_req)
1504 rbd_osd_req_destroy(obj_request->osd_req);
1505
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001506 switch (obj_request->img_request->data_type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001507 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001508 case OBJ_REQUEST_BIO:
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001509 case OBJ_REQUEST_BVECS:
Ilya Dryomov5359a172018-01-20 10:30:10 +01001510 break; /* Nothing to do */
Ilya Dryomovafb97882018-02-06 19:26:35 +01001511 case OBJ_REQUEST_OWN_BVECS:
1512 kfree(obj_request->bvec_pos.bvecs);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001513 break;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001514 default:
1515 rbd_assert(0);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001516 }
1517
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001518 kfree(obj_request->img_extents);
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001519 if (obj_request->copyup_bvecs) {
1520 for (i = 0; i < obj_request->copyup_bvec_count; i++) {
1521 if (obj_request->copyup_bvecs[i].bv_page)
1522 __free_page(obj_request->copyup_bvecs[i].bv_page);
1523 }
1524 kfree(obj_request->copyup_bvecs);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001525 }
1526
Alex Elder868311b2013-05-01 12:43:03 -05001527 kmem_cache_free(rbd_obj_request_cache, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001528}
1529
Alex Elderfb65d2282013-05-08 22:50:04 -05001530/* It's OK to call this for a device with no parent */
1531
1532static void rbd_spec_put(struct rbd_spec *spec);
1533static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1534{
1535 rbd_dev_remove_parent(rbd_dev);
1536 rbd_spec_put(rbd_dev->parent_spec);
1537 rbd_dev->parent_spec = NULL;
1538 rbd_dev->parent_overlap = 0;
1539}
1540
Alex Elderbf0d5f502012-11-22 00:00:08 -06001541/*
Alex Eldera2acd002013-05-08 22:50:04 -05001542 * Parent image reference counting is used to determine when an
1543 * image's parent fields can be safely torn down--after there are no
1544 * more in-flight requests to the parent image. When the last
1545 * reference is dropped, cleaning them up is safe.
1546 */
1547static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1548{
1549 int counter;
1550
1551 if (!rbd_dev->parent_spec)
1552 return;
1553
1554 counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1555 if (counter > 0)
1556 return;
1557
1558 /* Last reference; clean up parent data structures */
1559
1560 if (!counter)
1561 rbd_dev_unparent(rbd_dev);
1562 else
Ilya Dryomov9584d502014-07-11 12:11:20 +04001563 rbd_warn(rbd_dev, "parent reference underflow");
Alex Eldera2acd002013-05-08 22:50:04 -05001564}
1565
1566/*
1567 * If an image has a non-zero parent overlap, get a reference to its
1568 * parent.
1569 *
1570 * Returns true if the rbd device has a parent with a non-zero
1571 * overlap and a reference for it was successfully taken, or
1572 * false otherwise.
1573 */
1574static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1575{
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001576 int counter = 0;
Alex Eldera2acd002013-05-08 22:50:04 -05001577
1578 if (!rbd_dev->parent_spec)
1579 return false;
1580
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001581 down_read(&rbd_dev->header_rwsem);
1582 if (rbd_dev->parent_overlap)
1583 counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1584 up_read(&rbd_dev->header_rwsem);
Alex Eldera2acd002013-05-08 22:50:04 -05001585
1586 if (counter < 0)
Ilya Dryomov9584d502014-07-11 12:11:20 +04001587 rbd_warn(rbd_dev, "parent reference overflow");
Alex Eldera2acd002013-05-08 22:50:04 -05001588
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001589 return counter > 0;
Alex Eldera2acd002013-05-08 22:50:04 -05001590}
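
/*
 * Typical pairing, mirroring the image request create/destroy code
 * below:
 *
 *	if (rbd_dev_parent_get(rbd_dev))
 *		img_request_layered_set(img_request);
 *	...
 *	if (img_request_layered_test(img_request)) {
 *		img_request_layered_clear(img_request);
 *		rbd_dev_parent_put(img_request->rbd_dev);
 *	}
 */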
1591
Alex Elderbf0d5f502012-11-22 00:00:08 -06001592/*
1593 * Caller is responsible for filling in the list of object requests
1594 * that comprises the image request, and the Linux request pointer
1595 * (if there is one).
1596 */
Alex Eldercc344fa2013-02-19 12:25:56 -06001597static struct rbd_img_request *rbd_img_request_create(
1598 struct rbd_device *rbd_dev,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08001599 enum obj_operation_type op_type,
Josh Durgin4e752f02014-04-08 11:12:11 -07001600 struct ceph_snap_context *snapc)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001601{
1602 struct rbd_img_request *img_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001603
Ilya Dryomova0c58952018-01-22 16:03:06 +01001604 img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001605 if (!img_request)
1606 return NULL;
1607
Alex Elderbf0d5f502012-11-22 00:00:08 -06001608 img_request->rbd_dev = rbd_dev;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001609 img_request->op_type = op_type;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001610 if (!rbd_img_is_write(img_request))
Alex Elderbf0d5f502012-11-22 00:00:08 -06001611 img_request->snap_id = rbd_dev->spec->snap_id;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001612 else
1613 img_request->snapc = snapc;
1614
Alex Eldera2acd002013-05-08 22:50:04 -05001615 if (rbd_dev_parent_get(rbd_dev))
Alex Elderd0b2e942013-01-24 16:13:36 -06001616 img_request_layered_set(img_request);
Ilya Dryomova0c58952018-01-22 16:03:06 +01001617
Alex Elderbf0d5f502012-11-22 00:00:08 -06001618 spin_lock_init(&img_request->completion_lock);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001619 INIT_LIST_HEAD(&img_request->object_extents);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001620 kref_init(&img_request->kref);
1621
Ilya Dryomovdfd98752018-02-06 19:26:35 +01001622 dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev,
1623 obj_op_name(op_type), img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001624 return img_request;
1625}
1626
1627static void rbd_img_request_destroy(struct kref *kref)
1628{
1629 struct rbd_img_request *img_request;
1630 struct rbd_obj_request *obj_request;
1631 struct rbd_obj_request *next_obj_request;
1632
1633 img_request = container_of(kref, struct rbd_img_request, kref);
1634
Alex Elder37206ee2013-02-20 17:32:08 -06001635 dout("%s: img %p\n", __func__, img_request);
1636
Alex Elderbf0d5f502012-11-22 00:00:08 -06001637 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1638 rbd_img_obj_request_del(img_request, obj_request);
Alex Elder25dcf952013-01-25 17:08:55 -06001639 rbd_assert(img_request->obj_request_count == 0);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001640
Alex Eldera2acd002013-05-08 22:50:04 -05001641 if (img_request_layered_test(img_request)) {
1642 img_request_layered_clear(img_request);
1643 rbd_dev_parent_put(img_request->rbd_dev);
1644 }
1645
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001646 if (rbd_img_is_write(img_request))
Alex Elder812164f82013-04-30 00:44:32 -05001647 ceph_put_snap_context(img_request->snapc);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001648
Alex Elder1c2a9df2013-05-01 12:43:03 -05001649 kmem_cache_free(rbd_img_request_cache, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001650}
1651
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001652static void prune_extents(struct ceph_file_extent *img_extents,
1653 u32 *num_img_extents, u64 overlap)
Alex Eldere93f3152013-05-08 22:50:04 -05001654{
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001655 u32 cnt = *num_img_extents;
Alex Eldere93f3152013-05-08 22:50:04 -05001656
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001657 /* drop extents completely beyond the overlap */
1658 while (cnt && img_extents[cnt - 1].fe_off >= overlap)
1659 cnt--;
Alex Eldere93f3152013-05-08 22:50:04 -05001660
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001661 if (cnt) {
1662 struct ceph_file_extent *ex = &img_extents[cnt - 1];
Alex Eldere93f3152013-05-08 22:50:04 -05001663
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001664 /* trim final overlapping extent */
1665 if (ex->fe_off + ex->fe_len > overlap)
1666 ex->fe_len = overlap - ex->fe_off;
Alex Elder12178572013-02-08 09:55:49 -06001667 }
1668
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001669 *num_img_extents = cnt;
Alex Elder21692382013-04-05 01:27:12 -05001670}
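
/*
 * Worked example with hypothetical numbers: with overlap = 8M, the
 * extents 6M~1M and 7M~2M are pruned to 6M~1M and 7M~1M, and an
 * extent starting at or beyond 8M would be dropped entirely.
 */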
1671
Alex Elderf1a47392013-04-19 15:34:50 -05001672/*
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001673 * Determine the byte range(s) in the parent image covered by either
	1674	 * just the object extent or the entire object.
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001675 */
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001676static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
1677 bool entire)
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001678{
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001679 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Alex Elderc5b5ef62013-02-11 12:33:24 -06001680 int ret;
1681
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001682 if (!rbd_dev->parent_overlap)
1683 return 0;
1684
1685 ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
1686 entire ? 0 : obj_req->ex.oe_off,
1687 entire ? rbd_dev->layout.object_size :
1688 obj_req->ex.oe_len,
1689 &obj_req->img_extents,
1690 &obj_req->num_img_extents);
1691 if (ret)
1692 return ret;
1693
1694 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
1695 rbd_dev->parent_overlap);
1696 return 0;
1697}
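
/*
 * Example, assuming the default 4M object size and a non-fancy
 * layout: for object 2, entire=true reverse maps to the image extent
 * 8M~4M, while entire=false maps just oe_off~oe_len within that
 * object.  Either result is then pruned against the parent overlap.
 */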
1698
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001699static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
1700{
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001701 switch (obj_req->img_request->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001702 case OBJ_REQUEST_BIO:
1703 osd_req_op_extent_osd_data_bio(obj_req->osd_req, which,
1704 &obj_req->bio_pos,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001705 obj_req->ex.oe_len);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001706 break;
1707 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01001708 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001709 rbd_assert(obj_req->bvec_pos.iter.bi_size ==
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001710 obj_req->ex.oe_len);
Ilya Dryomovafb97882018-02-06 19:26:35 +01001711 rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001712 osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which,
1713 &obj_req->bvec_pos);
1714 break;
1715 default:
1716 rbd_assert(0);
1717 }
1718}
1719
1720static int rbd_obj_setup_read(struct rbd_obj_request *obj_req)
1721{
Ilya Dryomova162b302018-01-30 17:52:10 +01001722 obj_req->osd_req = rbd_osd_req_create(obj_req, 1);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001723 if (!obj_req->osd_req)
Ilya Dryomov710214e2016-09-15 17:53:32 +02001724 return -ENOMEM;
1725
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001726 osd_req_op_extent_init(obj_req->osd_req, 0, CEPH_OSD_OP_READ,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001727 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001728 rbd_osd_req_setup_data(obj_req, 0);
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001729
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001730 rbd_osd_req_format_read(obj_req);
1731 return 0;
1732}
1733
1734static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req,
1735 unsigned int which)
1736{
1737 struct page **pages;
Ilya Dryomov710214e2016-09-15 17:53:32 +02001738
Alex Elderc5b5ef62013-02-11 12:33:24 -06001739 /*
1740 * The response data for a STAT call consists of:
1741 * le64 length;
1742 * struct {
1743 * le32 tv_sec;
1744 * le32 tv_nsec;
1745 * } mtime;
1746 */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001747 pages = ceph_alloc_page_vector(1, GFP_NOIO);
1748 if (IS_ERR(pages))
1749 return PTR_ERR(pages);
Alex Elderc5b5ef62013-02-11 12:33:24 -06001750
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001751 osd_req_op_init(obj_req->osd_req, which, CEPH_OSD_OP_STAT, 0);
1752 osd_req_op_raw_data_in_pages(obj_req->osd_req, which, pages,
1753 8 + sizeof(struct ceph_timespec),
1754 0, false, true);
Ilya Dryomov980917f2016-09-12 18:59:42 +02001755 return 0;
Alex Elderc5b5ef62013-02-11 12:33:24 -06001756}
1757
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001758static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req,
1759 unsigned int which)
Alex Elderb454e362013-04-19 15:34:50 -05001760{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001761 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1762 u16 opcode;
Alex Elderb454e362013-04-19 15:34:50 -05001763
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001764 osd_req_op_alloc_hint_init(obj_req->osd_req, which++,
1765 rbd_dev->layout.object_size,
1766 rbd_dev->layout.object_size);
Alex Elderb454e362013-04-19 15:34:50 -05001767
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001768 if (rbd_obj_is_entire(obj_req))
1769 opcode = CEPH_OSD_OP_WRITEFULL;
1770 else
1771 opcode = CEPH_OSD_OP_WRITE;
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001772
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001773 osd_req_op_extent_init(obj_req->osd_req, which, opcode,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001774 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001775 rbd_osd_req_setup_data(obj_req, which++);
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001776
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001777 rbd_assert(which == obj_req->osd_req->r_num_ops);
1778 rbd_osd_req_format_write(obj_req);
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001779}
1780
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001781static int rbd_obj_setup_write(struct rbd_obj_request *obj_req)
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001782{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001783 unsigned int num_osd_ops, which = 0;
1784 int ret;
Ilya Dryomov058aa992016-09-12 14:44:45 +02001785
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001786 /* reverse map the entire object onto the parent */
1787 ret = rbd_obj_calc_img_extents(obj_req, true);
1788 if (ret)
1789 return ret;
1790
1791 if (obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001792 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
1793 num_osd_ops = 3; /* stat + setallochint + write/writefull */
1794 } else {
1795 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
1796 num_osd_ops = 2; /* setallochint + write/writefull */
1797 }
1798
Ilya Dryomova162b302018-01-30 17:52:10 +01001799 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001800 if (!obj_req->osd_req)
1801 return -ENOMEM;
1802
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001803 if (obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001804 ret = __rbd_obj_setup_stat(obj_req, which++);
1805 if (ret)
1806 return ret;
1807 }
1808
1809 __rbd_obj_setup_write(obj_req, which);
1810 return 0;
1811}
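
/*
 * Resulting write op sequences (sketch):
 *
 *	parent data to copy up:	stat + setallochint + write/writefull
 *	no parent data:		setallochint + write/writefull
 *
 * with writefull used when the request covers the entire object.
 */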
1812
1813static void __rbd_obj_setup_discard(struct rbd_obj_request *obj_req,
1814 unsigned int which)
1815{
1816 u16 opcode;
1817
1818 if (rbd_obj_is_entire(obj_req)) {
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001819 if (obj_req->num_img_extents) {
Ilya Dryomov2bb1e562018-02-06 19:26:34 +01001820 osd_req_op_init(obj_req->osd_req, which++,
1821 CEPH_OSD_OP_CREATE, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001822 opcode = CEPH_OSD_OP_TRUNCATE;
1823 } else {
1824 osd_req_op_init(obj_req->osd_req, which++,
1825 CEPH_OSD_OP_DELETE, 0);
1826 opcode = 0;
1827 }
1828 } else if (rbd_obj_is_tail(obj_req)) {
1829 opcode = CEPH_OSD_OP_TRUNCATE;
1830 } else {
1831 opcode = CEPH_OSD_OP_ZERO;
1832 }
1833
1834 if (opcode)
1835 osd_req_op_extent_init(obj_req->osd_req, which++, opcode,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001836 obj_req->ex.oe_off, obj_req->ex.oe_len,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001837 0, 0);
1838
1839 rbd_assert(which == obj_req->osd_req->r_num_ops);
1840 rbd_osd_req_format_write(obj_req);
1841}
1842
1843static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req)
1844{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001845 unsigned int num_osd_ops, which = 0;
1846 int ret;
1847
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001848 /* reverse map the entire object onto the parent */
1849 ret = rbd_obj_calc_img_extents(obj_req, true);
1850 if (ret)
1851 return ret;
1852
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001853 if (rbd_obj_is_entire(obj_req)) {
1854 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
Ilya Dryomov2bb1e562018-02-06 19:26:34 +01001855 if (obj_req->num_img_extents)
1856 num_osd_ops = 2; /* create + truncate */
1857 else
1858 num_osd_ops = 1; /* delete */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001859 } else {
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001860 if (obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001861 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
1862 num_osd_ops = 2; /* stat + truncate/zero */
1863 } else {
1864 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
1865 num_osd_ops = 1; /* truncate/zero */
1866 }
1867 }
1868
Ilya Dryomova162b302018-01-30 17:52:10 +01001869 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001870 if (!obj_req->osd_req)
1871 return -ENOMEM;
1872
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001873 if (!rbd_obj_is_entire(obj_req) && obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001874 ret = __rbd_obj_setup_stat(obj_req, which++);
1875 if (ret)
1876 return ret;
1877 }
1878
1879 __rbd_obj_setup_discard(obj_req, which);
1880 return 0;
1881}
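
/*
 * Discard op selection at a glance (sketch):
 *
 *	entire object, has parent data:	create + truncate
 *	entire object, no parent data:	delete
 *	tail of object:			[stat +] truncate
 *	middle of object:		[stat +] zero
 *
 * where the stat guard is added only when there are parent extents.
 */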
1882
1883/*
1884 * For each object request in @img_req, allocate an OSD request, add
1885 * individual OSD ops and prepare them for submission. The number of
1886 * OSD ops depends on op_type and the overlap point (if any).
1887 */
1888static int __rbd_img_fill_request(struct rbd_img_request *img_req)
1889{
1890 struct rbd_obj_request *obj_req;
1891 int ret;
1892
1893 for_each_obj_request(img_req, obj_req) {
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001894 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001895 case OBJ_OP_READ:
1896 ret = rbd_obj_setup_read(obj_req);
1897 break;
1898 case OBJ_OP_WRITE:
1899 ret = rbd_obj_setup_write(obj_req);
1900 break;
1901 case OBJ_OP_DISCARD:
1902 ret = rbd_obj_setup_discard(obj_req);
1903 break;
1904 default:
1905 rbd_assert(0);
1906 }
1907 if (ret)
1908 return ret;
1909 }
1910
1911 return 0;
1912}
1913
Ilya Dryomov5a237812018-02-06 19:26:34 +01001914union rbd_img_fill_iter {
1915 struct ceph_bio_iter bio_iter;
1916 struct ceph_bvec_iter bvec_iter;
1917};
1918
1919struct rbd_img_fill_ctx {
1920 enum obj_request_type pos_type;
1921 union rbd_img_fill_iter *pos;
1922 union rbd_img_fill_iter iter;
1923 ceph_object_extent_fn_t set_pos_fn;
Ilya Dryomovafb97882018-02-06 19:26:35 +01001924 ceph_object_extent_fn_t count_fn;
1925 ceph_object_extent_fn_t copy_fn;
Ilya Dryomov5a237812018-02-06 19:26:34 +01001926};
1927
1928static struct ceph_object_extent *alloc_object_extent(void *arg)
1929{
1930 struct rbd_img_request *img_req = arg;
1931 struct rbd_obj_request *obj_req;
1932
1933 obj_req = rbd_obj_request_create();
1934 if (!obj_req)
1935 return NULL;
1936
1937 rbd_img_obj_request_add(img_req, obj_req);
1938 return &obj_req->ex;
1939}
1940
1941/*
Ilya Dryomovafb97882018-02-06 19:26:35 +01001942 * While su != os && sc == 1 is technically not fancy (it's the same
1943 * layout as su == os && sc == 1), we can't use the nocopy path for it
1944 * because ->set_pos_fn() should be called only once per object.
1945 * ceph_file_to_extents() invokes action_fn once per stripe unit, so
1946 * treat su != os && sc == 1 as fancy.
Ilya Dryomov5a237812018-02-06 19:26:34 +01001947 */
Ilya Dryomovafb97882018-02-06 19:26:35 +01001948static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
1949{
1950 return l->stripe_unit != l->object_size;
1951}
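
/*
 * Examples with hypothetical layouts: the default su = os = 4M,
 * sc = 1 is not fancy; su = 1M, sc = 4, os = 4M is fancy, and so is
 * su = 1M, sc = 1, os = 4M per the note above.
 */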
1952
1953static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
1954 struct ceph_file_extent *img_extents,
1955 u32 num_img_extents,
1956 struct rbd_img_fill_ctx *fctx)
Ilya Dryomov5a237812018-02-06 19:26:34 +01001957{
1958 u32 i;
1959 int ret;
1960
1961 img_req->data_type = fctx->pos_type;
1962
1963 /*
1964 * Create object requests and set each object request's starting
1965 * position in the provided bio (list) or bio_vec array.
1966 */
1967 fctx->iter = *fctx->pos;
1968 for (i = 0; i < num_img_extents; i++) {
1969 ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
1970 img_extents[i].fe_off,
1971 img_extents[i].fe_len,
1972 &img_req->object_extents,
1973 alloc_object_extent, img_req,
1974 fctx->set_pos_fn, &fctx->iter);
1975 if (ret)
1976 return ret;
1977 }
1978
1979 return __rbd_img_fill_request(img_req);
1980}
1981
Ilya Dryomovafb97882018-02-06 19:26:35 +01001982/*
1983 * Map a list of image extents to a list of object extents, create the
1984 * corresponding object requests (normally each to a different object,
1985 * but not always) and add them to @img_req. For each object request,
1986 * set up its data descriptor to point to the corresponding chunk(s) of
1987 * @fctx->pos data buffer.
1988 *
1989 * Because ceph_file_to_extents() will merge adjacent object extents
1990 * together, each object request's data descriptor may point to multiple
1991 * different chunks of @fctx->pos data buffer.
1992 *
1993 * @fctx->pos data buffer is assumed to be large enough.
1994 */
1995static int rbd_img_fill_request(struct rbd_img_request *img_req,
1996 struct ceph_file_extent *img_extents,
1997 u32 num_img_extents,
1998 struct rbd_img_fill_ctx *fctx)
1999{
2000 struct rbd_device *rbd_dev = img_req->rbd_dev;
2001 struct rbd_obj_request *obj_req;
2002 u32 i;
2003 int ret;
2004
2005 if (fctx->pos_type == OBJ_REQUEST_NODATA ||
2006 !rbd_layout_is_fancy(&rbd_dev->layout))
2007 return rbd_img_fill_request_nocopy(img_req, img_extents,
2008 num_img_extents, fctx);
2009
2010 img_req->data_type = OBJ_REQUEST_OWN_BVECS;
2011
2012 /*
2013 * Create object requests and determine ->bvec_count for each object
2014 * request. Note that ->bvec_count sum over all object requests may
2015 * be greater than the number of bio_vecs in the provided bio (list)
2016 * or bio_vec array because when mapped, those bio_vecs can straddle
2017 * stripe unit boundaries.
2018 */
2019 fctx->iter = *fctx->pos;
2020 for (i = 0; i < num_img_extents; i++) {
2021 ret = ceph_file_to_extents(&rbd_dev->layout,
2022 img_extents[i].fe_off,
2023 img_extents[i].fe_len,
2024 &img_req->object_extents,
2025 alloc_object_extent, img_req,
2026 fctx->count_fn, &fctx->iter);
2027 if (ret)
2028 return ret;
2029 }
2030
2031 for_each_obj_request(img_req, obj_req) {
2032 obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
2033 sizeof(*obj_req->bvec_pos.bvecs),
2034 GFP_NOIO);
2035 if (!obj_req->bvec_pos.bvecs)
2036 return -ENOMEM;
Alex Elderb454e362013-04-19 15:34:50 -05002037 }
2038
2039 /*
Ilya Dryomovafb97882018-02-06 19:26:35 +01002040 * Fill in each object request's private bio_vec array, splitting and
2041 * rearranging the provided bio_vecs in stripe unit chunks as needed.
Alex Elderb454e362013-04-19 15:34:50 -05002042 */
Ilya Dryomovafb97882018-02-06 19:26:35 +01002043 fctx->iter = *fctx->pos;
2044 for (i = 0; i < num_img_extents; i++) {
2045 ret = ceph_iterate_extents(&rbd_dev->layout,
2046 img_extents[i].fe_off,
2047 img_extents[i].fe_len,
2048 &img_req->object_extents,
2049 fctx->copy_fn, &fctx->iter);
2050 if (ret)
2051 return ret;
2052 }
Alex Elder3d7efd12013-04-19 15:34:50 -05002053
Ilya Dryomovafb97882018-02-06 19:26:35 +01002054 return __rbd_img_fill_request(img_req);
Alex Elderb454e362013-04-19 15:34:50 -05002055}
2056
Ilya Dryomov5a237812018-02-06 19:26:34 +01002057static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
2058 u64 off, u64 len)
2059{
2060 struct ceph_file_extent ex = { off, len };
2061 union rbd_img_fill_iter dummy;
2062 struct rbd_img_fill_ctx fctx = {
2063 .pos_type = OBJ_REQUEST_NODATA,
2064 .pos = &dummy,
2065 };
2066
2067 return rbd_img_fill_request(img_req, &ex, 1, &fctx);
2068}
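
/*
 * Used for requests that carry no data payload; e.g. a discard of
 * off~len would be set up with (sketch)
 *
 *	ret = rbd_img_fill_nodata(img_request, off, len);
 */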
2069
2070static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2071{
2072 struct rbd_obj_request *obj_req =
2073 container_of(ex, struct rbd_obj_request, ex);
2074 struct ceph_bio_iter *it = arg;
2075
2076 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2077 obj_req->bio_pos = *it;
2078 ceph_bio_iter_advance(it, bytes);
2079}
2080
Ilya Dryomovafb97882018-02-06 19:26:35 +01002081static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2082{
2083 struct rbd_obj_request *obj_req =
2084 container_of(ex, struct rbd_obj_request, ex);
2085 struct ceph_bio_iter *it = arg;
2086
2087 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2088 ceph_bio_iter_advance_step(it, bytes, ({
2089 obj_req->bvec_count++;
2090 }));
2092}
2093
2094static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2095{
2096 struct rbd_obj_request *obj_req =
2097 container_of(ex, struct rbd_obj_request, ex);
2098 struct ceph_bio_iter *it = arg;
2099
2100 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2101 ceph_bio_iter_advance_step(it, bytes, ({
2102 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2103 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2104 }));
2105}
2106
Ilya Dryomov5a237812018-02-06 19:26:34 +01002107static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2108 struct ceph_file_extent *img_extents,
2109 u32 num_img_extents,
2110 struct ceph_bio_iter *bio_pos)
2111{
2112 struct rbd_img_fill_ctx fctx = {
2113 .pos_type = OBJ_REQUEST_BIO,
2114 .pos = (union rbd_img_fill_iter *)bio_pos,
2115 .set_pos_fn = set_bio_pos,
Ilya Dryomovafb97882018-02-06 19:26:35 +01002116 .count_fn = count_bio_bvecs,
2117 .copy_fn = copy_bio_bvecs,
Ilya Dryomov5a237812018-02-06 19:26:34 +01002118 };
2119
2120 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2121 &fctx);
2122}
2123
2124static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2125 u64 off, u64 len, struct bio *bio)
2126{
2127 struct ceph_file_extent ex = { off, len };
2128 struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };
2129
2130 return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
2131}
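
/*
 * Minimal caller sketch, assuming a block-layer request rq starting
 * at image offset off and spanning len bytes:
 *
 *	ret = rbd_img_fill_from_bio(img_request, off, len, rq->bio);
 */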
2132
2133static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2134{
2135 struct rbd_obj_request *obj_req =
2136 container_of(ex, struct rbd_obj_request, ex);
2137 struct ceph_bvec_iter *it = arg;
2138
2139 obj_req->bvec_pos = *it;
2140 ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
2141 ceph_bvec_iter_advance(it, bytes);
2142}
2143
Ilya Dryomovafb97882018-02-06 19:26:35 +01002144static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2145{
2146 struct rbd_obj_request *obj_req =
2147 container_of(ex, struct rbd_obj_request, ex);
2148 struct ceph_bvec_iter *it = arg;
2149
2150 ceph_bvec_iter_advance_step(it, bytes, ({
2151 obj_req->bvec_count++;
2152 }));
2153}
2154
2155static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2156{
2157 struct rbd_obj_request *obj_req =
2158 container_of(ex, struct rbd_obj_request, ex);
2159 struct ceph_bvec_iter *it = arg;
2160
2161 ceph_bvec_iter_advance_step(it, bytes, ({
2162 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2163 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2164 }));
2165}
2166
Ilya Dryomov5a237812018-02-06 19:26:34 +01002167static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2168 struct ceph_file_extent *img_extents,
2169 u32 num_img_extents,
2170 struct ceph_bvec_iter *bvec_pos)
2171{
2172 struct rbd_img_fill_ctx fctx = {
2173 .pos_type = OBJ_REQUEST_BVECS,
2174 .pos = (union rbd_img_fill_iter *)bvec_pos,
2175 .set_pos_fn = set_bvec_pos,
Ilya Dryomovafb97882018-02-06 19:26:35 +01002176 .count_fn = count_bvecs,
2177 .copy_fn = copy_bvecs,
Ilya Dryomov5a237812018-02-06 19:26:34 +01002178 };
2179
2180 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2181 &fctx);
2182}
2183
2184static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2185 struct ceph_file_extent *img_extents,
2186 u32 num_img_extents,
2187 struct bio_vec *bvecs)
2188{
2189 struct ceph_bvec_iter it = {
2190 .bvecs = bvecs,
2191 .iter = { .bi_size = ceph_file_extents_bytes(img_extents,
2192 num_img_extents) },
2193 };
2194
2195 return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
2196 &it);
2197}
2198
Ilya Dryomovefbd1a12018-01-30 17:52:11 +01002199static void rbd_img_request_submit(struct rbd_img_request *img_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002200{
Alex Elderbf0d5f502012-11-22 00:00:08 -06002201 struct rbd_obj_request *obj_request;
2202
Alex Elder37206ee2013-02-20 17:32:08 -06002203 dout("%s: img %p\n", __func__, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002204
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002205 rbd_img_request_get(img_request);
Ilya Dryomovefbd1a12018-01-30 17:52:11 +01002206 for_each_obj_request(img_request, obj_request)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002207 rbd_obj_request_submit(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002208
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002209 rbd_img_request_put(img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002210}
2211
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002212static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
Alex Elder8b3e1a52013-01-24 16:13:36 -06002213{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002214 struct rbd_img_request *img_req = obj_req->img_request;
2215 struct rbd_img_request *child_img_req;
2216 int ret;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002217
Ilya Dryomove93aca02018-02-06 19:26:35 +01002218 child_img_req = rbd_img_request_create(img_req->rbd_dev->parent,
2219 OBJ_OP_READ, NULL);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002220 if (!child_img_req)
2221 return -ENOMEM;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002222
Ilya Dryomove93aca02018-02-06 19:26:35 +01002223 __set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2224 child_img_req->obj_request = obj_req;
Alex Elder02c74fb2013-05-06 17:40:33 -05002225
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002226 if (!rbd_img_is_write(img_req)) {
Ilya Dryomovecc633c2018-02-01 11:50:47 +01002227 switch (img_req->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002228 case OBJ_REQUEST_BIO:
Ilya Dryomov5a237812018-02-06 19:26:34 +01002229 ret = __rbd_img_fill_from_bio(child_img_req,
2230 obj_req->img_extents,
2231 obj_req->num_img_extents,
2232 &obj_req->bio_pos);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002233 break;
2234 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01002235 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov5a237812018-02-06 19:26:34 +01002236 ret = __rbd_img_fill_from_bvecs(child_img_req,
2237 obj_req->img_extents,
2238 obj_req->num_img_extents,
2239 &obj_req->bvec_pos);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002240 break;
2241 default:
2242 rbd_assert(0);
2243 }
2244 } else {
Ilya Dryomov5a237812018-02-06 19:26:34 +01002245 ret = rbd_img_fill_from_bvecs(child_img_req,
2246 obj_req->img_extents,
2247 obj_req->num_img_extents,
2248 obj_req->copyup_bvecs);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002249 }
2250 if (ret) {
2251 rbd_img_request_put(child_img_req);
2252 return ret;
2253 }
2254
2255 rbd_img_request_submit(child_img_req);
2256 return 0;
2257}
2258
2259static bool rbd_obj_handle_read(struct rbd_obj_request *obj_req)
2260{
2261 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2262 int ret;
2263
2264 if (obj_req->result == -ENOENT &&
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002265 rbd_dev->parent_overlap && !obj_req->tried_parent) {
2266 /* reverse map this object extent onto the parent */
2267 ret = rbd_obj_calc_img_extents(obj_req, false);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002268 if (ret) {
2269 obj_req->result = ret;
2270 return true;
2271 }
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002272
2273 if (obj_req->num_img_extents) {
2274 obj_req->tried_parent = true;
2275 ret = rbd_obj_read_from_parent(obj_req);
2276 if (ret) {
2277 obj_req->result = ret;
2278 return true;
2279 }
2280 return false;
2281 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002282 }
Alex Elder02c74fb2013-05-06 17:40:33 -05002283
2284 /*
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002285 * -ENOENT means a hole in the image -- zero-fill the entire
2286 * length of the request. A short read also implies zero-fill
	2287	 * to the end of the request. In both cases we update the xferred
	2288	 * count to indicate that the whole request was satisfied.
Alex Elder02c74fb2013-05-06 17:40:33 -05002289 */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002290 if (obj_req->result == -ENOENT ||
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002291 (!obj_req->result && obj_req->xferred < obj_req->ex.oe_len)) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002292 rbd_assert(!obj_req->xferred || !obj_req->result);
2293 rbd_obj_zero_range(obj_req, obj_req->xferred,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002294 obj_req->ex.oe_len - obj_req->xferred);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002295 obj_req->result = 0;
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002296 obj_req->xferred = obj_req->ex.oe_len;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002297 }
2298
2299 return true;
2300}
2301
2302/*
2303 * copyup_bvecs pages are never highmem pages
2304 */
2305static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
2306{
2307 struct ceph_bvec_iter it = {
2308 .bvecs = bvecs,
2309 .iter = { .bi_size = bytes },
2310 };
2311
2312 ceph_bvec_iter_advance_step(&it, bytes, ({
2313 if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
2314 bv.bv_len))
2315 return false;
2316 }));
2317 return true;
2318}
2319
2320static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes)
2321{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002322 unsigned int num_osd_ops = obj_req->osd_req->r_num_ops;
2323
2324 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
2325 rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT);
2326 rbd_osd_req_destroy(obj_req->osd_req);
2327
2328 /*
2329 * Create a copyup request with the same number of OSD ops as
	2330	 * the original request. The original request was stat + op(s);
2331 * the new copyup request will be copyup + the same op(s).
2332 */
Ilya Dryomova162b302018-01-30 17:52:10 +01002333 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002334 if (!obj_req->osd_req)
2335 return -ENOMEM;
2336
2337 /*
2338 * Only send non-zero copyup data to save some I/O and network
2339 * bandwidth -- zero copyup data is equivalent to the object not
2340 * existing.
2341 */
2342 if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) {
2343 dout("%s obj_req %p detected zeroes\n", __func__, obj_req);
2344 bytes = 0;
2345 }
2346
2347 osd_req_op_cls_init(obj_req->osd_req, 0, CEPH_OSD_OP_CALL, "rbd",
2348 "copyup");
2349 osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0,
2350 obj_req->copyup_bvecs, bytes);
2351
Ilya Dryomov9bb02482018-01-30 17:52:10 +01002352 switch (obj_req->img_request->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002353 case OBJ_OP_WRITE:
2354 __rbd_obj_setup_write(obj_req, 1);
2355 break;
2356 case OBJ_OP_DISCARD:
2357 rbd_assert(!rbd_obj_is_entire(obj_req));
2358 __rbd_obj_setup_discard(obj_req, 1);
2359 break;
2360 default:
2361 rbd_assert(0);
2362 }
2363
2364 rbd_obj_request_submit(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002365 return 0;
2366}
2367
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01002368static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
2369{
2370 u32 i;
2371
2372 rbd_assert(!obj_req->copyup_bvecs);
2373 obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
2374 obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
2375 sizeof(*obj_req->copyup_bvecs),
2376 GFP_NOIO);
2377 if (!obj_req->copyup_bvecs)
2378 return -ENOMEM;
2379
2380 for (i = 0; i < obj_req->copyup_bvec_count; i++) {
2381 unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
2382
2383 obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
2384 if (!obj_req->copyup_bvecs[i].bv_page)
2385 return -ENOMEM;
2386
2387 obj_req->copyup_bvecs[i].bv_offset = 0;
2388 obj_req->copyup_bvecs[i].bv_len = len;
2389 obj_overlap -= len;
2390 }
2391
2392 rbd_assert(!obj_overlap);
2393 return 0;
2394}
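
/*
 * Worked example, assuming 4K pages: obj_overlap = 10240 gives
 * copyup_bvec_count = 3 with bv_len of 4096, 4096 and 2048, i.e.
 * page-sized chunks with a short tail.
 */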
2395
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002396static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req)
2397{
2398 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002399 int ret;
2400
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002401 rbd_assert(obj_req->num_img_extents);
2402 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
2403 rbd_dev->parent_overlap);
2404 if (!obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002405 /*
2406 * The overlap has become 0 (most likely because the
2407 * image has been flattened). Use rbd_obj_issue_copyup()
2408 * to re-submit the original write request -- the copyup
2409 * operation itself will be a no-op, since someone must
2410 * have populated the child object while we weren't
2411 * looking. Move to WRITE_FLAT state as we'll be done
2412 * with the operation once the null copyup completes.
2413 */
2414 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
2415 return rbd_obj_issue_copyup(obj_req, 0);
2416 }
2417
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002418 ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002419 if (ret)
2420 return ret;
2421
2422 obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002423 return rbd_obj_read_from_parent(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002424}
2425
2426static bool rbd_obj_handle_write(struct rbd_obj_request *obj_req)
2427{
2428 int ret;
2429
2430again:
2431 switch (obj_req->write_state) {
2432 case RBD_OBJ_WRITE_GUARD:
2433 rbd_assert(!obj_req->xferred);
2434 if (obj_req->result == -ENOENT) {
2435 /*
2436 * The target object doesn't exist. Read the data for
2437 * the entire target object up to the overlap point (if
2438 * any) from the parent, so we can use it for a copyup.
2439 */
2440 ret = rbd_obj_handle_write_guard(obj_req);
2441 if (ret) {
2442 obj_req->result = ret;
2443 return true;
2444 }
2445 return false;
2446 }
2447 /* fall through */
2448 case RBD_OBJ_WRITE_FLAT:
2449 if (!obj_req->result)
2450 /*
2451 * There is no such thing as a successful short
2452 * write -- indicate the whole request was satisfied.
2453 */
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002454 obj_req->xferred = obj_req->ex.oe_len;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002455 return true;
2456 case RBD_OBJ_WRITE_COPYUP:
2457 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
2458 if (obj_req->result)
2459 goto again;
2460
2461 rbd_assert(obj_req->xferred);
2462 ret = rbd_obj_issue_copyup(obj_req, obj_req->xferred);
2463 if (ret) {
2464 obj_req->result = ret;
2465 return true;
2466 }
2467 return false;
2468 default:
2469 rbd_assert(0);
2470 }
2471}
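
/*
 * Write state transitions at a glance (sketch):
 *
 *	WRITE_FLAT ----------------------------------------> complete
 *	WRITE_GUARD --(-ENOENT)--> read from parent (COPYUP)
 *	WRITE_COPYUP --> issue copyup ops --> WRITE_GUARD --> complete
 *
 * Guarded writes that don't hit -ENOENT complete like flat ones.
 */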
2472
2473/*
2474 * Returns true if @obj_req is completed, or false otherwise.
2475 */
2476static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req)
2477{
Ilya Dryomov9bb02482018-01-30 17:52:10 +01002478 switch (obj_req->img_request->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002479 case OBJ_OP_READ:
2480 return rbd_obj_handle_read(obj_req);
2481 case OBJ_OP_WRITE:
2482 return rbd_obj_handle_write(obj_req);
2483 case OBJ_OP_DISCARD:
2484 if (rbd_obj_handle_write(obj_req)) {
2485 /*
2486 * Hide -ENOENT from delete/truncate/zero -- discarding
2487 * a non-existent object is not a problem.
2488 */
2489 if (obj_req->result == -ENOENT) {
2490 obj_req->result = 0;
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002491 obj_req->xferred = obj_req->ex.oe_len;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002492 }
2493 return true;
2494 }
2495 return false;
2496 default:
2497 rbd_assert(0);
2498 }
2499}
2500
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002501static void rbd_obj_end_request(struct rbd_obj_request *obj_req)
2502{
2503 struct rbd_img_request *img_req = obj_req->img_request;
2504
2505 rbd_assert((!obj_req->result &&
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002506 obj_req->xferred == obj_req->ex.oe_len) ||
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002507 (obj_req->result < 0 && !obj_req->xferred));
2508 if (!obj_req->result) {
2509 img_req->xferred += obj_req->xferred;
Ilya Dryomov980917f2016-09-12 18:59:42 +02002510 return;
Alex Elder02c74fb2013-05-06 17:40:33 -05002511 }
2512
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002513 rbd_warn(img_req->rbd_dev,
2514 "%s at objno %llu %llu~%llu result %d xferred %llu",
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002515 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
2516 obj_req->ex.oe_off, obj_req->ex.oe_len, obj_req->result,
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002517 obj_req->xferred);
2518 if (!img_req->result) {
2519 img_req->result = obj_req->result;
2520 img_req->xferred = 0;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002521 }
Alex Elder8b3e1a52013-01-24 16:13:36 -06002522}
2523
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002524static void rbd_img_end_child_request(struct rbd_img_request *img_req)
Alex Elder8b3e1a52013-01-24 16:13:36 -06002525{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002526 struct rbd_obj_request *obj_req = img_req->obj_request;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002527
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002528 rbd_assert(test_bit(IMG_REQ_CHILD, &img_req->flags));
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002529 rbd_assert((!img_req->result &&
2530 img_req->xferred == rbd_obj_img_extents_bytes(obj_req)) ||
2531 (img_req->result < 0 && !img_req->xferred));
Alex Elder8b3e1a52013-01-24 16:13:36 -06002532
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002533 obj_req->result = img_req->result;
2534 obj_req->xferred = img_req->xferred;
2535 rbd_img_request_put(img_req);
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002536}
Alex Elder8b3e1a52013-01-24 16:13:36 -06002537
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002538static void rbd_img_end_request(struct rbd_img_request *img_req)
2539{
2540 rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
2541 rbd_assert((!img_req->result &&
2542 img_req->xferred == blk_rq_bytes(img_req->rq)) ||
2543 (img_req->result < 0 && !img_req->xferred));
Alex Elder8b3e1a52013-01-24 16:13:36 -06002544
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002545 blk_mq_end_request(img_req->rq,
2546 errno_to_blk_status(img_req->result));
2547 rbd_img_request_put(img_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002548}
Alex Elder8b3e1a52013-01-24 16:13:36 -06002549
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002550static void rbd_obj_handle_request(struct rbd_obj_request *obj_req)
2551{
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002552 struct rbd_img_request *img_req;
2553
2554again:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002555 if (!__rbd_obj_handle_request(obj_req))
2556 return;
2557
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002558 img_req = obj_req->img_request;
2559 spin_lock(&img_req->completion_lock);
2560 rbd_obj_end_request(obj_req);
2561 rbd_assert(img_req->pending_count);
2562 if (--img_req->pending_count) {
2563 spin_unlock(&img_req->completion_lock);
2564 return;
2565 }
2566
2567 spin_unlock(&img_req->completion_lock);
2568 if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
2569 obj_req = img_req->obj_request;
2570 rbd_img_end_child_request(img_req);
2571 goto again;
2572 }
2573 rbd_img_end_request(img_req);
Alex Elder8b3e1a52013-01-24 16:13:36 -06002574}
2575
static const struct rbd_client_id rbd_empty_cid;

static bool rbd_cid_equal(const struct rbd_client_id *lhs,
			  const struct rbd_client_id *rhs)
{
	return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
}

static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
{
	struct rbd_client_id cid;

	mutex_lock(&rbd_dev->watch_mutex);
	cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
	cid.handle = rbd_dev->watch_cookie;
	mutex_unlock(&rbd_dev->watch_mutex);
	return cid;
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
			      const struct rbd_client_id *cid)
{
	dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
	     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
	     cid->gid, cid->handle);
	rbd_dev->owner_cid = *cid; /* struct */
}

static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
{
	mutex_lock(&rbd_dev->watch_mutex);
	sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
	mutex_unlock(&rbd_dev->watch_mutex);
}

static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
{
	struct rbd_client_id cid = rbd_get_cid(rbd_dev);

	strcpy(rbd_dev->lock_cookie, cookie);
	rbd_set_owner_cid(rbd_dev, &cid);
	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
}

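/*
 * A note on cookies: format_lock_cookie() above produces
 * "<RBD_LOCK_COOKIE_PREFIX> <watch_cookie>" (e.g. "auto 94223",
 * assuming the usual "auto" prefix), so the exclusive-lock cookie
 * embeds the watch id.  find_watcher() below relies on this to map a
 * locker back to a live watcher, and __rbd_lock() caches the string in
 * lock_cookie so rbd_unlock()/rbd_reacquire_lock() can refer to the
 * lock instance that was actually taken.
 */
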
/*
 * lock_rwsem must be held for write
 */
static int rbd_lock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char cookie[32];
	int ret;

	WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
		rbd_dev->lock_cookie[0] != '\0');

	format_lock_cookie(rbd_dev, cookie);
	ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			    RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
			    RBD_LOCK_TAG, "", 0);
	if (ret)
		return ret;

	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
	__rbd_lock(rbd_dev, cookie);
	return 0;
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_unlock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
		rbd_dev->lock_cookie[0] == '\0');

	ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			      RBD_LOCK_NAME, rbd_dev->lock_cookie);
	if (ret && ret != -ENOENT)
		rbd_warn(rbd_dev, "failed to unlock: %d", ret);

	/* treat errors as the image is unlocked */
	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
	rbd_dev->lock_cookie[0] = '\0';
	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
	queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
}

static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
				enum rbd_notify_op notify_op,
				struct page ***preply_pages,
				size_t *preply_len)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
	char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
	int buf_size = sizeof(buf);
	void *p = buf;

	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);

	/* encode *LockPayload NotifyMessage (op + ClientId) */
	ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
	ceph_encode_32(&p, notify_op);
	ceph_encode_64(&p, cid.gid);
	ceph_encode_64(&p, cid.handle);

	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
				&rbd_dev->header_oloc, buf, buf_size,
				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
}

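/*
 * On the wire, the buffer encoded above looks like this (all
 * little-endian), which is where the 4 + 8 + 8 in the buf sizing
 * comes from:
 *
 *	u8 struct_v = 2, u8 compat_v = 1, u32 len	ceph_start_encoding()
 *	u32 notify_op					RBD_NOTIFY_OP_*
 *	u64 gid, u64 handle				ClientId of the sender
 */
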
static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
			       enum rbd_notify_op notify_op)
{
	struct page **reply_pages;
	size_t reply_len;

	__rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
}

static void rbd_notify_acquired_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  acquired_lock_work);

	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
}

static void rbd_notify_released_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  released_lock_work);

	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
}

static int rbd_request_lock(struct rbd_device *rbd_dev)
{
	struct page **reply_pages;
	size_t reply_len;
	bool lock_owner_responded = false;
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
				   &reply_pages, &reply_len);
	if (ret && ret != -ETIMEDOUT) {
		rbd_warn(rbd_dev, "failed to request lock: %d", ret);
		goto out;
	}

	if (reply_len > 0 && reply_len <= PAGE_SIZE) {
		void *p = page_address(reply_pages[0]);
		void *const end = p + reply_len;
		u32 n;

		ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
		while (n--) {
			u8 struct_v;
			u32 len;

			ceph_decode_need(&p, end, 8 + 8, e_inval);
			p += 8 + 8; /* skip gid and cookie */

			ceph_decode_32_safe(&p, end, len, e_inval);
			if (!len)
				continue;

			if (lock_owner_responded) {
				rbd_warn(rbd_dev,
					 "duplicate lock owners detected");
				ret = -EIO;
				goto out;
			}

			lock_owner_responded = true;
			ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
						  &struct_v, &len);
			if (ret) {
				rbd_warn(rbd_dev,
					 "failed to decode ResponseMessage: %d",
					 ret);
				goto e_inval;
			}

			ret = ceph_decode_32(&p);
		}
	}

	if (!lock_owner_responded) {
		rbd_warn(rbd_dev, "no lock owners detected");
		ret = -ETIMEDOUT;
	}

out:
	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
	return ret;

e_inval:
	ret = -EINVAL;
	goto out;
}

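/*
 * The value decoded from the ResponseMessage above is the current
 * owner's verdict: 0 means it acked the request and should eventually
 * release the lock, while a negative errno (in practice -EROFS from
 * rbd_handle_request_lock() on a peer mapped with the exclusive
 * option) means it refuses.  rbd_acquire_lock() below acts on that
 * value.
 */
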
static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
{
	dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);

	cancel_delayed_work(&rbd_dev->lock_dwork);
	if (wake_all)
		wake_up_all(&rbd_dev->lock_waitq);
	else
		wake_up(&rbd_dev->lock_waitq);
}

static int get_lock_owner_info(struct rbd_device *rbd_dev,
			       struct ceph_locker **lockers, u32 *num_lockers)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	u8 lock_type;
	char *lock_tag;
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
				 &rbd_dev->header_oloc, RBD_LOCK_NAME,
				 &lock_type, &lock_tag, lockers, num_lockers);
	if (ret)
		return ret;

	if (*num_lockers == 0) {
		dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
		goto out;
	}

	if (strcmp(lock_tag, RBD_LOCK_TAG)) {
		rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
			 lock_tag);
		ret = -EBUSY;
		goto out;
	}

	if (lock_type == CEPH_CLS_LOCK_SHARED) {
		rbd_warn(rbd_dev, "shared lock type detected");
		ret = -EBUSY;
		goto out;
	}

	if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
		    strlen(RBD_LOCK_COOKIE_PREFIX))) {
		rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
			 (*lockers)[0].id.cookie);
		ret = -EBUSY;
		goto out;
	}

out:
	kfree(lock_tag);
	return ret;
}

static int find_watcher(struct rbd_device *rbd_dev,
			const struct ceph_locker *locker)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_watch_item *watchers;
	u32 num_watchers;
	u64 cookie;
	int i;
	int ret;

	ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
				      &rbd_dev->header_oloc, &watchers,
				      &num_watchers);
	if (ret)
		return ret;

	sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
	for (i = 0; i < num_watchers; i++) {
		if (!memcmp(&watchers[i].addr, &locker->info.addr,
			    sizeof(locker->info.addr)) &&
		    watchers[i].cookie == cookie) {
			struct rbd_client_id cid = {
				.gid = le64_to_cpu(watchers[i].name.num),
				.handle = cookie,
			};

			dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
			     rbd_dev, cid.gid, cid.handle);
			rbd_set_owner_cid(rbd_dev, &cid);
			ret = 1;
			goto out;
		}
	}

	dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
	ret = 0;
out:
	kfree(watchers);
	return ret;
}

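/*
 * Liveness heuristic: a lock owner is presumed alive only if an entry
 * in the object's watcher list matches both the locker's address and
 * the watch cookie parsed out of its lock cookie.  A locker without a
 * matching watcher most likely crashed (its watch timed out), which is
 * what lets rbd_try_lock() below break the stale lock.
 */
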
/*
 * lock_rwsem must be held for write
 */
static int rbd_try_lock(struct rbd_device *rbd_dev)
{
	struct ceph_client *client = rbd_dev->rbd_client->client;
	struct ceph_locker *lockers;
	u32 num_lockers;
	int ret;

	for (;;) {
		ret = rbd_lock(rbd_dev);
		if (ret != -EBUSY)
			return ret;

		/* determine if the current lock holder is still alive */
		ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
		if (ret)
			return ret;

		if (num_lockers == 0)
			goto again;

		ret = find_watcher(rbd_dev, lockers);
		if (ret) {
			if (ret > 0)
				ret = 0; /* have to request lock */
			goto out;
		}

		rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
			 ENTITY_NAME(lockers[0].id.name));

		ret = ceph_monc_blacklist_add(&client->monc,
					      &lockers[0].info.addr);
		if (ret) {
			rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
				 ENTITY_NAME(lockers[0].id.name), ret);
			goto out;
		}

		ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
					  &rbd_dev->header_oloc, RBD_LOCK_NAME,
					  lockers[0].id.cookie,
					  &lockers[0].id.name);
		if (ret && ret != -ENOENT)
			goto out;

again:
		ceph_free_lockers(lockers, num_lockers);
	}

out:
	ceph_free_lockers(lockers, num_lockers);
	return ret;
}

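/*
 * Recovery sequence used above when the owner appears dead: blacklist
 * the stale client on the monitors first, then break its lock, then
 * loop back and retry ceph_cls_lock().  Blacklisting before breaking
 * the lock is what prevents a not-quite-dead owner from issuing writes
 * after we have taken over.
 */
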
/*
 * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED
 */
static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev,
						int *pret)
{
	enum rbd_lock_state lock_state;

	down_read(&rbd_dev->lock_rwsem);
	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (__rbd_is_lock_owner(rbd_dev)) {
		lock_state = rbd_dev->lock_state;
		up_read(&rbd_dev->lock_rwsem);
		return lock_state;
	}

	up_read(&rbd_dev->lock_rwsem);
	down_write(&rbd_dev->lock_rwsem);
	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (!__rbd_is_lock_owner(rbd_dev)) {
		*pret = rbd_try_lock(rbd_dev);
		if (*pret)
			rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret);
	}

	lock_state = rbd_dev->lock_state;
	up_write(&rbd_dev->lock_rwsem);
	return lock_state;
}

static void rbd_acquire_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
					    struct rbd_device, lock_dwork);
	enum rbd_lock_state lock_state;
	int ret = 0;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);
again:
	lock_state = rbd_try_acquire_lock(rbd_dev, &ret);
	if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) {
		if (lock_state == RBD_LOCK_STATE_LOCKED)
			wake_requests(rbd_dev, true);
		dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
		     rbd_dev, lock_state, ret);
		return;
	}

	ret = rbd_request_lock(rbd_dev);
	if (ret == -ETIMEDOUT) {
		goto again; /* treat this as a dead client */
	} else if (ret == -EROFS) {
		rbd_warn(rbd_dev, "peer will not release lock");
		/*
		 * If this is rbd_add_acquire_lock(), we want to fail
		 * immediately -- reuse BLACKLISTED flag.  Otherwise we
		 * want to block.
		 */
		if (!(rbd_dev->disk->flags & GENHD_FL_UP)) {
			set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
			/* wake "rbd map --exclusive" process */
			wake_requests(rbd_dev, false);
		}
	} else if (ret < 0) {
		rbd_warn(rbd_dev, "error requesting lock: %d", ret);
		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
				 RBD_RETRY_DELAY);
	} else {
		/*
		 * lock owner acked, but resend if we don't see them
		 * release the lock
		 */
		dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
		     rbd_dev);
		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
		    msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
	}
}

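/*
 * rbd_acquire_lock() is effectively a small state machine driven off
 * lock_dwork: try to take the lock directly; if some live peer holds
 * it, ask for it with RBD_NOTIFY_OP_REQUEST_LOCK and requeue with a
 * 2 * RBD_NOTIFY_TIMEOUT grace period in case the RELEASED_LOCK notify
 * never arrives.  A notify timeout is treated like a dead owner and
 * loops straight back into rbd_try_acquire_lock().
 */
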
/*
 * lock_rwsem must be held for write
 */
static bool rbd_release_lock(struct rbd_device *rbd_dev)
{
	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
		return false;

	rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
	downgrade_write(&rbd_dev->lock_rwsem);
	/*
	 * Ensure that all in-flight IO is flushed.
	 *
	 * FIXME: ceph_osdc_sync() flushes the entire OSD client, which
	 * may be shared with other devices.
	 */
	ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc);
	up_read(&rbd_dev->lock_rwsem);

	down_write(&rbd_dev->lock_rwsem);
	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
		return false;

	rbd_unlock(rbd_dev);
	/*
	 * Give others a chance to grab the lock - we would re-acquire
	 * almost immediately if we got new IO during ceph_osdc_sync()
	 * otherwise.  We need to ack our own notifications, so this
	 * lock_dwork will be requeued from rbd_wait_state_locked()
	 * after wake_requests() in rbd_handle_released_lock().
	 */
	cancel_delayed_work(&rbd_dev->lock_dwork);
	return true;
}

static void rbd_release_lock_work(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  unlock_work);

	down_write(&rbd_dev->lock_rwsem);
	rbd_release_lock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);
}

static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
				     void **p)
{
	struct rbd_client_id cid = { 0 };

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
		down_write(&rbd_dev->lock_rwsem);
		if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
			/*
			 * we already know that the remote client is
			 * the owner
			 */
			up_write(&rbd_dev->lock_rwsem);
			return;
		}

		rbd_set_owner_cid(rbd_dev, &cid);
		downgrade_write(&rbd_dev->lock_rwsem);
	} else {
		down_read(&rbd_dev->lock_rwsem);
	}

	if (!__rbd_is_lock_owner(rbd_dev))
		wake_requests(rbd_dev, false);
	up_read(&rbd_dev->lock_rwsem);
}

static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
				     void **p)
{
	struct rbd_client_id cid = { 0 };

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
		down_write(&rbd_dev->lock_rwsem);
		if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
			dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
			     __func__, rbd_dev, cid.gid, cid.handle,
			     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
			up_write(&rbd_dev->lock_rwsem);
			return;
		}

		rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
		downgrade_write(&rbd_dev->lock_rwsem);
	} else {
		down_read(&rbd_dev->lock_rwsem);
	}

	if (!__rbd_is_lock_owner(rbd_dev))
		wake_requests(rbd_dev, false);
	up_read(&rbd_dev->lock_rwsem);
}

/*
 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
 * ResponseMessage is needed.
 */
static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
				   void **p)
{
	struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
	struct rbd_client_id cid = { 0 };
	int result = 1;

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (rbd_cid_equal(&cid, &my_cid))
		return result;

	down_read(&rbd_dev->lock_rwsem);
	if (__rbd_is_lock_owner(rbd_dev)) {
		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
		    rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
			goto out_unlock;

		/*
		 * encode ResponseMessage(0) so the peer can detect
		 * a missing owner
		 */
		result = 0;

		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
			if (!rbd_dev->opts->exclusive) {
				dout("%s rbd_dev %p queueing unlock_work\n",
				     __func__, rbd_dev);
				queue_work(rbd_dev->task_wq,
					   &rbd_dev->unlock_work);
			} else {
				/* refuse to release the lock */
				result = -EROFS;
			}
		}
	}

out_unlock:
	up_read(&rbd_dev->lock_rwsem);
	return result;
}

static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
				     u64 notify_id, u64 cookie, s32 *result)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char buf[4 + CEPH_ENCODING_START_BLK_LEN];
	int buf_size = sizeof(buf);
	int ret;

	if (result) {
		void *p = buf;

		/* encode ResponseMessage */
		ceph_start_encoding(&p, 1, 1,
				    buf_size - CEPH_ENCODING_START_BLK_LEN);
		ceph_encode_32(&p, *result);
	} else {
		buf_size = 0;
	}

	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
				   &rbd_dev->header_oloc, notify_id, cookie,
				   buf, buf_size);
	if (ret)
		rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
}

static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
				   u64 cookie)
{
	dout("%s rbd_dev %p\n", __func__, rbd_dev);
	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
}

static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
					  u64 notify_id, u64 cookie, s32 result)
{
	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
}

static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
			 u64 notifier_id, void *data, size_t data_len)
{
	struct rbd_device *rbd_dev = arg;
	void *p = data;
	void *const end = p + data_len;
	u8 struct_v = 0;
	u32 len;
	u32 notify_op;
	int ret;

	dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
	     __func__, rbd_dev, cookie, notify_id, data_len);
	if (data_len) {
		ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
					  &struct_v, &len);
		if (ret) {
			rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
				 ret);
			return;
		}

		notify_op = ceph_decode_32(&p);
	} else {
		/* legacy notification for header updates */
		notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
		len = 0;
	}

	dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
	switch (notify_op) {
	case RBD_NOTIFY_OP_ACQUIRED_LOCK:
		rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_RELEASED_LOCK:
		rbd_handle_released_lock(rbd_dev, struct_v, &p);
		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_REQUEST_LOCK:
		ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
		if (ret <= 0)
			rbd_acknowledge_notify_result(rbd_dev, notify_id,
						      cookie, ret);
		else
			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_HEADER_UPDATE:
		ret = rbd_dev_refresh(rbd_dev);
		if (ret)
			rbd_warn(rbd_dev, "refresh failed: %d", ret);

		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	default:
		if (rbd_is_lock_owner(rbd_dev))
			rbd_acknowledge_notify_result(rbd_dev, notify_id,
						      cookie, -EOPNOTSUPP);
		else
			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	}
}

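/*
 * Every notify handled above must be acked, even unknown ops --
 * otherwise the notifier (e.g. a peer in rbd_request_lock()) would
 * stall until RBD_NOTIFY_TIMEOUT expires.  Only REQUEST_LOCK ever
 * carries a payload back; everything else is an empty ack.
 */
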
static void __rbd_unregister_watch(struct rbd_device *rbd_dev);

static void rbd_watch_errcb(void *arg, u64 cookie, int err)
{
	struct rbd_device *rbd_dev = arg;

	rbd_warn(rbd_dev, "encountered watch error: %d", err);

	down_write(&rbd_dev->lock_rwsem);
	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
	up_write(&rbd_dev->lock_rwsem);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
		__rbd_unregister_watch(rbd_dev);
		rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;

		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
	}
	mutex_unlock(&rbd_dev->watch_mutex);
}

/*
 * watch_mutex must be locked
 */
static int __rbd_register_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_linger_request *handle;

	rbd_assert(!rbd_dev->watch_handle);
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
				 &rbd_dev->header_oloc, rbd_watch_cb,
				 rbd_watch_errcb, rbd_dev);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	rbd_dev->watch_handle = handle;
	return 0;
}

/*
 * watch_mutex must be locked
 */
static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	rbd_assert(rbd_dev->watch_handle);
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
	if (ret)
		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);

	rbd_dev->watch_handle = NULL;
}

static int rbd_register_watch(struct rbd_device *rbd_dev)
{
	int ret;

	mutex_lock(&rbd_dev->watch_mutex);
	rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
	ret = __rbd_register_watch(rbd_dev);
	if (ret)
		goto out;

	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;

out:
	mutex_unlock(&rbd_dev->watch_mutex);
	return ret;
}

static void cancel_tasks_sync(struct rbd_device *rbd_dev)
{
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	cancel_delayed_work_sync(&rbd_dev->watch_dwork);
	cancel_work_sync(&rbd_dev->acquired_lock_work);
	cancel_work_sync(&rbd_dev->released_lock_work);
	cancel_delayed_work_sync(&rbd_dev->lock_dwork);
	cancel_work_sync(&rbd_dev->unlock_work);
}

static void rbd_unregister_watch(struct rbd_device *rbd_dev)
{
	WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
	cancel_tasks_sync(rbd_dev);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
		__rbd_unregister_watch(rbd_dev);
	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
	mutex_unlock(&rbd_dev->watch_mutex);

	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char cookie[32];
	int ret;

	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);

	format_lock_cookie(rbd_dev, cookie);
	ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, RBD_LOCK_NAME,
				  CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
				  RBD_LOCK_TAG, cookie);
	if (ret) {
		if (ret != -EOPNOTSUPP)
			rbd_warn(rbd_dev, "failed to update lock cookie: %d",
				 ret);

		/*
		 * Lock cookie cannot be updated on older OSDs, so do
		 * a manual release and queue an acquire.
		 */
		if (rbd_release_lock(rbd_dev))
			queue_delayed_work(rbd_dev->task_wq,
					   &rbd_dev->lock_dwork, 0);
	} else {
		__rbd_lock(rbd_dev, cookie);
	}
}

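/*
 * Why reacquire at all: when the watch dies and is re-established, the
 * new linger id becomes the new watch_cookie, so the cookie recorded
 * with the held lock no longer matches a live watch.
 * ceph_cls_set_cookie() swaps the cookie server-side where supported;
 * the release-and-requeue fallback covers OSDs that predate it.
 */
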
static void rbd_reregister_watch(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
					    struct rbd_device, watch_dwork);
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
		mutex_unlock(&rbd_dev->watch_mutex);
		return;
	}

	ret = __rbd_register_watch(rbd_dev);
	if (ret) {
		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
		if (ret == -EBLACKLISTED || ret == -ENOENT) {
			set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
			wake_requests(rbd_dev, true);
		} else {
			queue_delayed_work(rbd_dev->task_wq,
					   &rbd_dev->watch_dwork,
					   RBD_RETRY_DELAY);
		}
		mutex_unlock(&rbd_dev->watch_mutex);
		return;
	}

	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
	mutex_unlock(&rbd_dev->watch_mutex);

	down_write(&rbd_dev->lock_rwsem);
	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
		rbd_reacquire_lock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);

	ret = rbd_dev_refresh(rbd_dev);
	if (ret)
		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
}

/*
 * Synchronous osd object method call.  Returns the number of bytes
 * returned in the inbound buffer, or a negative error code.
 */
static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
			       struct ceph_object_id *oid,
			       struct ceph_object_locator *oloc,
			       const char *method_name,
			       const void *outbound,
			       size_t outbound_size,
			       void *inbound,
			       size_t inbound_size)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct page *req_page = NULL;
	struct page *reply_page;
	int ret;

	/*
	 * Method calls are ultimately read operations.  The result
	 * should be placed into the inbound buffer provided.  They
	 * also supply outbound data--parameters for the object
	 * method.  Currently if this is present it will be a
	 * snapshot id.
	 */
	if (outbound) {
		if (outbound_size > PAGE_SIZE)
			return -E2BIG;

		req_page = alloc_page(GFP_KERNEL);
		if (!req_page)
			return -ENOMEM;

		memcpy(page_address(req_page), outbound, outbound_size);
	}

	reply_page = alloc_page(GFP_KERNEL);
	if (!reply_page) {
		if (req_page)
			__free_page(req_page);
		return -ENOMEM;
	}

	ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
			     CEPH_OSD_FLAG_READ, req_page, outbound_size,
			     reply_page, &inbound_size);
	if (!ret) {
		memcpy(inbound, page_address(reply_page), inbound_size);
		ret = inbound_size;
	}

	if (req_page)
		__free_page(req_page);
	__free_page(reply_page);
	return ret;
}

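/*
 * Illustrative use of rbd_obj_method_sync() (a sketch, not a verbatim
 * caller): fetching the image size via the "get_size" class method,
 * assuming the usual header object and a snapshot id as the sole
 * parameter.  On success the return value is the number of reply
 * bytes.
 *
 *	struct {
 *		u8 order;
 *		__le64 size;
 *	} __attribute__ ((packed)) size_buf = { 0 };
 *	__le64 snapid = cpu_to_le64(CEPH_NOSNAP);
 *	int ret;
 *
 *	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
 *				  &rbd_dev->header_oloc, "get_size",
 *				  &snapid, sizeof(snapid),
 *				  &size_buf, sizeof(size_buf));
 *	if (ret < 0)
 *		return ret;
 *	if (ret < sizeof(size_buf))
 *		return -ERANGE;
 */
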
/*
 * lock_rwsem must be held for read
 */
static int rbd_wait_state_locked(struct rbd_device *rbd_dev, bool may_acquire)
{
	DEFINE_WAIT(wait);
	int ret = 0;

	if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags))
		return -EBLACKLISTED;

	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
		return 0;

	if (!may_acquire) {
		rbd_warn(rbd_dev, "exclusive lock required");
		return -EROFS;
	}

	do {
		/*
		 * Note the use of mod_delayed_work() in rbd_acquire_lock()
		 * and cancel_delayed_work() in wake_requests().
		 */
		dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
		prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
					  TASK_UNINTERRUPTIBLE);
		up_read(&rbd_dev->lock_rwsem);
		schedule();
		down_read(&rbd_dev->lock_rwsem);
		if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
			ret = -EBLACKLISTED;
			break;
		}
	} while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);

	finish_wait(&rbd_dev->lock_waitq, &wait);
	return ret;
}

static void rbd_queue_workfn(struct work_struct *work)
{
	struct request *rq = blk_mq_rq_from_pdu(work);
	struct rbd_device *rbd_dev = rq->q->queuedata;
	struct rbd_img_request *img_request;
	struct ceph_snap_context *snapc = NULL;
	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
	u64 length = blk_rq_bytes(rq);
	enum obj_operation_type op_type;
	u64 mapping_size;
	bool must_be_locked;
	int result;

	switch (req_op(rq)) {
	case REQ_OP_DISCARD:
	case REQ_OP_WRITE_ZEROES:
		op_type = OBJ_OP_DISCARD;
		break;
	case REQ_OP_WRITE:
		op_type = OBJ_OP_WRITE;
		break;
	case REQ_OP_READ:
		op_type = OBJ_OP_READ;
		break;
	default:
		dout("%s: non-fs request type %d\n", __func__, req_op(rq));
		result = -EIO;
		goto err;
	}

	/* Ignore/skip any zero-length requests */

	if (!length) {
		dout("%s: zero-length request\n", __func__);
		result = 0;
		goto err_rq;
	}

	rbd_assert(op_type == OBJ_OP_READ ||
		   rbd_dev->spec->snap_id == CEPH_NOSNAP);

	/*
	 * Quit early if the mapped snapshot no longer exists.  It's
	 * still possible the snapshot will have disappeared by the
	 * time our request arrives at the osd, but there's no sense in
	 * sending it if we already know.
	 */
	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
		dout("request for non-existent snapshot");
		rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
		result = -ENXIO;
		goto err_rq;
	}

	if (offset && length > U64_MAX - offset + 1) {
		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
			 length);
		result = -EINVAL;
		goto err_rq;	/* Shouldn't happen */
	}

	blk_mq_start_request(rq);

	down_read(&rbd_dev->header_rwsem);
	mapping_size = rbd_dev->mapping.size;
	if (op_type != OBJ_OP_READ) {
		snapc = rbd_dev->header.snapc;
		ceph_get_snap_context(snapc);
	}
	up_read(&rbd_dev->header_rwsem);

	if (offset + length > mapping_size) {
		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
			 length, mapping_size);
		result = -EIO;
		goto err_rq;
	}

	must_be_locked =
	    (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
	    (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read);
	if (must_be_locked) {
		down_read(&rbd_dev->lock_rwsem);
		result = rbd_wait_state_locked(rbd_dev,
					       !rbd_dev->opts->exclusive);
		if (result)
			goto err_unlock;
	}

	img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
	if (!img_request) {
		result = -ENOMEM;
		goto err_unlock;
	}
	img_request->rq = rq;
	snapc = NULL; /* img_request consumes a ref */

	if (op_type == OBJ_OP_DISCARD)
		result = rbd_img_fill_nodata(img_request, offset, length);
	else
		result = rbd_img_fill_from_bio(img_request, offset, length,
					       rq->bio);
	if (result)
		goto err_img_request;

	rbd_img_request_submit(img_request);
	if (must_be_locked)
		up_read(&rbd_dev->lock_rwsem);
	return;

err_img_request:
	rbd_img_request_put(img_request);
err_unlock:
	if (must_be_locked)
		up_read(&rbd_dev->lock_rwsem);
err_rq:
	if (result)
		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
			 obj_op_name(op_type), length, offset, result);
	ceph_put_snap_context(snapc);
err:
	blk_mq_end_request(rq, errno_to_blk_status(result));
}

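/*
 * Note on threading: rbd_queue_rq() below runs in blk-mq submission
 * context, where it must not block.  The request's pdu is
 * pre-initialized as a work_struct (see rbd_init_request()), so
 * queuing it on rbd_wq is all that is needed to get into
 * rbd_queue_workfn() above, which is allowed to sleep (lock waits,
 * memory allocation).
 */
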
static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
		const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	struct work_struct *work = blk_mq_rq_to_pdu(rq);

	queue_work(rbd_wq, work);
	return BLK_STS_OK;
}

static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	blk_cleanup_queue(rbd_dev->disk->queue);
	blk_mq_free_tag_set(&rbd_dev->tag_set);
	put_disk(rbd_dev->disk);
	rbd_dev->disk = NULL;
}

static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
			     struct ceph_object_id *oid,
			     struct ceph_object_locator *oloc,
			     void *buf, int buf_len)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_request *req;
	struct page **pages;
	int num_pages = calc_pages_for(0, buf_len);
	int ret;

	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
	if (!req)
		return -ENOMEM;

	ceph_oid_copy(&req->r_base_oid, oid);
	ceph_oloc_copy(&req->r_base_oloc, oloc);
	req->r_flags = CEPH_OSD_FLAG_READ;

	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
	if (ret)
		goto out_req;

	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		goto out_req;
	}

	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
	osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
					 true);

	ceph_osdc_start_request(osdc, req, false);
	ret = ceph_osdc_wait_request(osdc, req);
	if (ret >= 0)
		ceph_copy_from_page_vector(pages, buf, 0, ret);

out_req:
	ceph_osdc_put_request(req);
	return ret;
}

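/*
 * A sketch of how rbd_obj_read_sync() is meant to be called -- this
 * mirrors the v1 header read just below:
 *
 *	ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
 *				&rbd_dev->header_oloc, buf, buf_len);
 *
 * ret is the number of bytes actually read (possibly short) or a
 * negative errno.  The page vector is handed to the OSD request with
 * own_pages == true, so it is freed by ceph_osdc_put_request().
 */
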
/*
 * Read the complete header for the given rbd device.  On successful
 * return, the rbd_dev->header field will contain up-to-date
 * information about the image.
 */
static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		kfree(ondisk);

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return -ENOMEM;

		ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
					&rbd_dev->header_oloc, ondisk, size);
		if (ret < 0)
			goto out;
		if ((size_t)ret < size) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
				 size, ret);
			goto out;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "invalid header");
			goto out;
		}

		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	ret = rbd_header_from_disk(rbd_dev, ondisk);
out:
	kfree(ondisk);

	return ret;
}

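/*
 * For reference, the v1 on-disk header read above is laid out roughly
 * as (see struct rbd_image_header_ondisk in rbd_types.h):
 *
 *	fixed-size header (text, signature, version, object order,
 *	image size, snap_count, snap_names_len, ...)
 *	snap_count * struct rbd_image_snap_ondisk (64-bit id + size)
 *	snap_names_len bytes of NUL-terminated snapshot names
 *
 * which is why the buffer is sized and the read retried until
 * snap_count stops changing underneath us.
 */
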
/*
 * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
 * has disappeared from the (just updated) snapshot context.
 */
static void rbd_exists_validate(struct rbd_device *rbd_dev)
{
	u64 snap_id;

	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
		return;

	snap_id = rbd_dev->spec->snap_id;
	if (snap_id == CEPH_NOSNAP)
		return;

	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
}

static void rbd_dev_update_size(struct rbd_device *rbd_dev)
{
	sector_t size;

	/*
	 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
	 * try to update its size.  If REMOVING is set, updating size
	 * is just useless work since the device can't be opened.
	 */
	if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
	    !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
		dout("setting size to %llu sectors", (unsigned long long)size);
		set_capacity(rbd_dev->disk, size);
		revalidate_disk(rbd_dev->disk);
	}
}

static int rbd_dev_refresh(struct rbd_device *rbd_dev)
{
	u64 mapping_size;
	int ret;

	down_write(&rbd_dev->header_rwsem);
	mapping_size = rbd_dev->mapping.size;

	ret = rbd_dev_header_info(rbd_dev);
	if (ret)
		goto out;

	/*
	 * If there is a parent, see if it has disappeared due to the
	 * mapped image getting flattened.
	 */
	if (rbd_dev->parent) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret)
			goto out;
	}

	if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
		rbd_dev->mapping.size = rbd_dev->header.image_size;
	} else {
		/* validate mapped snapshot's EXISTS flag */
		rbd_exists_validate(rbd_dev);
	}

out:
	up_write(&rbd_dev->header_rwsem);
	if (!ret && mapping_size != rbd_dev->mapping.size)
		rbd_dev_update_size(rbd_dev);

	return ret;
}

static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
		unsigned int hctx_idx, unsigned int numa_node)
{
	struct work_struct *work = blk_mq_rq_to_pdu(rq);

	INIT_WORK(work, rbd_queue_workfn);
	return 0;
}

static const struct blk_mq_ops rbd_mq_ops = {
	.queue_rq	= rbd_queue_rq,
	.init_request	= rbd_init_request,
};

Yehuda Sadeh602adf42010-08-12 16:11:25 -07003908static int rbd_init_disk(struct rbd_device *rbd_dev)
3909{
3910 struct gendisk *disk;
3911 struct request_queue *q;
Alex Elder593a9e72012-02-07 12:03:37 -06003912 u64 segment_size;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003913 int err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003914
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003915 /* create gendisk info */
Ilya Dryomov7e513d42013-12-16 19:26:32 +02003916 disk = alloc_disk(single_major ?
3917 (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
3918 RBD_MINORS_PER_MAJOR);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003919 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05003920 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003921
Alex Elderf0f8cef2012-01-29 13:57:44 -06003922 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05003923 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003924 disk->major = rbd_dev->major;
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02003925 disk->first_minor = rbd_dev->minor;
Ilya Dryomov7e513d42013-12-16 19:26:32 +02003926 if (single_major)
3927 disk->flags |= GENHD_FL_EXT_DEVT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003928 disk->fops = &rbd_bd_ops;
3929 disk->private_data = rbd_dev;
3930
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003931 memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
3932 rbd_dev->tag_set.ops = &rbd_mq_ops;
Ilya Dryomovb5584182015-06-23 16:21:19 +03003933 rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003934 rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
Ilya Dryomovb5584182015-06-23 16:21:19 +03003935 rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003936 rbd_dev->tag_set.nr_hw_queues = 1;
3937 rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
3938
3939 err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
3940 if (err)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003941 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07003942
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003943 q = blk_mq_init_queue(&rbd_dev->tag_set);
3944 if (IS_ERR(q)) {
3945 err = PTR_ERR(q);
3946 goto out_tag_set;
3947 }
3948
Bart Van Assche8b904b52018-03-07 17:10:10 -08003949 blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
Ilya Dryomovd8a2c892015-03-24 16:15:17 +03003950 /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
Alex Elder593a9e72012-02-07 12:03:37 -06003951
Josh Durgin029bcbd2011-07-22 11:35:23 -07003952 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06003953 segment_size = rbd_obj_bytes(&rbd_dev->header);
3954 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
Ilya Dryomov0d9fde42015-10-07 16:09:35 +02003955 q->limits.max_sectors = queue_max_hw_sectors(q);
Ilya Dryomov21acdf42017-12-21 15:35:11 +01003956 blk_queue_max_segments(q, USHRT_MAX);
Ilya Dryomov24f1df62018-01-12 17:22:10 +01003957 blk_queue_max_segment_size(q, UINT_MAX);
Alex Elder593a9e72012-02-07 12:03:37 -06003958 blk_queue_io_min(q, segment_size);
3959 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07003960
Guangliang Zhao90e98c52014-04-01 22:22:16 +08003961 /* enable the discard support */
Bart Van Assche8b904b52018-03-07 17:10:10 -08003962 blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
Guangliang Zhao90e98c52014-04-01 22:22:16 +08003963 q->limits.discard_granularity = segment_size;
Jens Axboe2bb4cd52015-07-14 08:15:12 -06003964 blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE);
Ilya Dryomov6ac56952017-05-22 19:59:24 +02003965 blk_queue_max_write_zeroes_sectors(q, segment_size / SECTOR_SIZE);
Guangliang Zhao90e98c52014-04-01 22:22:16 +08003966
Ronny Hegewaldbae818e2015-10-15 18:50:46 +00003967 if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
Jan Karadc3b17c2017-02-02 15:56:50 +01003968 q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
Ronny Hegewaldbae818e2015-10-15 18:50:46 +00003969
Ilya Dryomov5769ed02017-04-13 12:17:38 +02003970 /*
3971 * disk_release() expects a queue ref from add_disk() and will
3972 * put it. Hold an extra ref until add_disk() is called.
3973 */
3974 WARN_ON(!blk_get_queue(q));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003975 disk->queue = q;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003976 q->queuedata = rbd_dev;
3977
3978 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003979
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003980 return 0;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003981out_tag_set:
3982 blk_mq_free_tag_set(&rbd_dev->tag_set);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003983out_disk:
3984 put_disk(disk);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003985 return err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003986}
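/*
 * Worked example (illustrative, not part of the driver): assuming the
 * common default object order of 22, rbd_obj_bytes() yields
 * segment_size = 1 << 22 = 4 MiB, so the queue set up above advertises
 * 4194304 / 512 = 8192 max hw sectors and a 4 MiB io_min/io_opt.
 */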
3987
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003988/*
3989 sysfs
3990*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003991
Alex Elder593a9e72012-02-07 12:03:37 -06003992static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3993{
3994 return container_of(dev, struct rbd_device, dev);
3995}
3996
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003997static ssize_t rbd_size_show(struct device *dev,
3998 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003999{
Alex Elder593a9e72012-02-07 12:03:37 -06004000 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004001
Alex Elderfc71d832013-04-26 15:44:36 -05004002 return sprintf(buf, "%llu\n",
4003 (unsigned long long)rbd_dev->mapping.size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004004}
4005
Alex Elder34b13182012-07-13 20:35:12 -05004006/*
4007 * Note this shows the features for whatever's mapped, which is not
4008 * necessarily the base image.
4009 */
4010static ssize_t rbd_features_show(struct device *dev,
4011 struct device_attribute *attr, char *buf)
4012{
4013 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4014
4015 return sprintf(buf, "0x%016llx\n",
Alex Elderfc71d832013-04-26 15:44:36 -05004016 (unsigned long long)rbd_dev->mapping.features);
Alex Elder34b13182012-07-13 20:35:12 -05004017}
4018
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004019static ssize_t rbd_major_show(struct device *dev,
4020 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004021{
Alex Elder593a9e72012-02-07 12:03:37 -06004022 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004023
Alex Elderfc71d832013-04-26 15:44:36 -05004024 if (rbd_dev->major)
4025 return sprintf(buf, "%d\n", rbd_dev->major);
4026
4027 return sprintf(buf, "(none)\n");
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004028}
Alex Elderfc71d832013-04-26 15:44:36 -05004029
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004030static ssize_t rbd_minor_show(struct device *dev,
4031 struct device_attribute *attr, char *buf)
4032{
4033 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4034
4035 return sprintf(buf, "%d\n", rbd_dev->minor);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004036}
4037
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02004038static ssize_t rbd_client_addr_show(struct device *dev,
4039 struct device_attribute *attr, char *buf)
4040{
4041 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4042 struct ceph_entity_addr *client_addr =
4043 ceph_client_addr(rbd_dev->rbd_client->client);
4044
4045 return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
4046 le32_to_cpu(client_addr->nonce));
4047}
4048
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004049static ssize_t rbd_client_id_show(struct device *dev,
4050 struct device_attribute *attr, char *buf)
4051{
Alex Elder593a9e72012-02-07 12:03:37 -06004052 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004053
Alex Elder1dbb4392012-01-24 10:08:37 -06004054 return sprintf(buf, "client%lld\n",
Ilya Dryomov033268a2016-08-12 14:59:58 +02004055 ceph_client_gid(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004056}
4057
Mike Christie267fb902016-08-18 18:38:43 +02004058static ssize_t rbd_cluster_fsid_show(struct device *dev,
4059 struct device_attribute *attr, char *buf)
4060{
4061 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4062
4063 return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
4064}
4065
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004066static ssize_t rbd_config_info_show(struct device *dev,
4067 struct device_attribute *attr, char *buf)
4068{
4069 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4070
4071 return sprintf(buf, "%s\n", rbd_dev->config_info);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004072}
4073
4074static ssize_t rbd_pool_show(struct device *dev,
4075 struct device_attribute *attr, char *buf)
4076{
Alex Elder593a9e72012-02-07 12:03:37 -06004077 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004078
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004079 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004080}
4081
Alex Elder9bb2f332012-07-12 10:46:35 -05004082static ssize_t rbd_pool_id_show(struct device *dev,
4083 struct device_attribute *attr, char *buf)
4084{
4085 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4086
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004087 return sprintf(buf, "%llu\n",
Alex Elderfc71d832013-04-26 15:44:36 -05004088 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05004089}
4090
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004091static ssize_t rbd_name_show(struct device *dev,
4092 struct device_attribute *attr, char *buf)
4093{
Alex Elder593a9e72012-02-07 12:03:37 -06004094 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004095
Alex Eldera92ffdf2012-10-30 19:40:33 -05004096 if (rbd_dev->spec->image_name)
4097 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
4098
4099 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004100}
4101
Alex Elder589d30e2012-07-10 20:30:11 -05004102static ssize_t rbd_image_id_show(struct device *dev,
4103 struct device_attribute *attr, char *buf)
4104{
4105 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4106
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004107 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05004108}
4109
Alex Elder34b13182012-07-13 20:35:12 -05004110/*
4111 * Shows the name of the currently-mapped snapshot (or
4112 * RBD_SNAP_HEAD_NAME for the base image).
4113 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004114static ssize_t rbd_snap_show(struct device *dev,
4115 struct device_attribute *attr,
4116 char *buf)
4117{
Alex Elder593a9e72012-02-07 12:03:37 -06004118 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004119
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004120 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004121}
4122
Mike Christie92a58672016-08-18 18:38:44 +02004123static ssize_t rbd_snap_id_show(struct device *dev,
4124 struct device_attribute *attr, char *buf)
4125{
4126 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4127
4128 return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
4129}
4130
Alex Elder86b00e02012-10-25 23:34:42 -05004131/*
Ilya Dryomovff961282014-07-22 21:53:07 +04004132 * For a v2 image, shows the chain of parent images, separated by empty
4133 * lines. For v1 images or if there is no parent, shows "(no parent
4134 * image)".
Alex Elder86b00e02012-10-25 23:34:42 -05004135 */
4136static ssize_t rbd_parent_show(struct device *dev,
Ilya Dryomovff961282014-07-22 21:53:07 +04004137 struct device_attribute *attr,
4138 char *buf)
Alex Elder86b00e02012-10-25 23:34:42 -05004139{
4140 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Ilya Dryomovff961282014-07-22 21:53:07 +04004141 ssize_t count = 0;
Alex Elder86b00e02012-10-25 23:34:42 -05004142
Ilya Dryomovff961282014-07-22 21:53:07 +04004143 if (!rbd_dev->parent)
Alex Elder86b00e02012-10-25 23:34:42 -05004144 return sprintf(buf, "(no parent image)\n");
4145
Ilya Dryomovff961282014-07-22 21:53:07 +04004146 for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
4147 struct rbd_spec *spec = rbd_dev->parent_spec;
Alex Elder86b00e02012-10-25 23:34:42 -05004148
Ilya Dryomovff961282014-07-22 21:53:07 +04004149 count += sprintf(&buf[count], "%s"
4150 "pool_id %llu\npool_name %s\n"
4151 "image_id %s\nimage_name %s\n"
4152 "snap_id %llu\nsnap_name %s\n"
4153 "overlap %llu\n",
4154 !count ? "" : "\n", /* first? */
4155 spec->pool_id, spec->pool_name,
4156 spec->image_id, spec->image_name ?: "(unknown)",
4157 spec->snap_id, spec->snap_name,
4158 rbd_dev->parent_overlap);
4159 }
Alex Elder86b00e02012-10-25 23:34:42 -05004160
Ilya Dryomovff961282014-07-22 21:53:07 +04004161 return count;
Alex Elder86b00e02012-10-25 23:34:42 -05004162}
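/*
 * Sample "parent" attribute output for a single-level clone; all
 * values below are made up:
 *
 *	pool_id 2
 *	pool_name rbd
 *	image_id 1038e74b0dc51
 *	image_name parent-img
 *	snap_id 4
 *	snap_name snap1
 *	overlap 1073741824
 */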
4163
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004164static ssize_t rbd_image_refresh(struct device *dev,
4165 struct device_attribute *attr,
4166 const char *buf,
4167 size_t size)
4168{
Alex Elder593a9e72012-02-07 12:03:37 -06004169 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05004170 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004171
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004172 ret = rbd_dev_refresh(rbd_dev);
Alex Eldere627db02013-05-06 07:40:30 -05004173 if (ret)
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04004174 return ret;
Alex Elderb8136232012-07-25 09:32:41 -05004175
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04004176 return size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004177}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004178
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004179static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
Alex Elder34b13182012-07-13 20:35:12 -05004180static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004181static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004182static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02004183static DEVICE_ATTR(client_addr, S_IRUGO, rbd_client_addr_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004184static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
Mike Christie267fb902016-08-18 18:38:43 +02004185static DEVICE_ATTR(cluster_fsid, S_IRUGO, rbd_cluster_fsid_show, NULL);
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004186static DEVICE_ATTR(config_info, S_IRUSR, rbd_config_info_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004187static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
Alex Elder9bb2f332012-07-12 10:46:35 -05004188static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004189static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
Alex Elder589d30e2012-07-10 20:30:11 -05004190static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004191static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
4192static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
Mike Christie92a58672016-08-18 18:38:44 +02004193static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
Alex Elder86b00e02012-10-25 23:34:42 -05004194static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004195
4196static struct attribute *rbd_attrs[] = {
4197 &dev_attr_size.attr,
Alex Elder34b13182012-07-13 20:35:12 -05004198 &dev_attr_features.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004199 &dev_attr_major.attr,
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004200 &dev_attr_minor.attr,
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02004201 &dev_attr_client_addr.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004202 &dev_attr_client_id.attr,
Mike Christie267fb902016-08-18 18:38:43 +02004203 &dev_attr_cluster_fsid.attr,
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004204 &dev_attr_config_info.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004205 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05004206 &dev_attr_pool_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004207 &dev_attr_name.attr,
Alex Elder589d30e2012-07-10 20:30:11 -05004208 &dev_attr_image_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004209 &dev_attr_current_snap.attr,
Mike Christie92a58672016-08-18 18:38:44 +02004210 &dev_attr_snap_id.attr,
Alex Elder86b00e02012-10-25 23:34:42 -05004211 &dev_attr_parent.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004212 &dev_attr_refresh.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004213 NULL
4214};
4215
4216static struct attribute_group rbd_attr_group = {
4217 .attrs = rbd_attrs,
4218};
4219
4220static const struct attribute_group *rbd_attr_groups[] = {
4221 &rbd_attr_group,
4222 NULL
4223};
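/*
 * The attributes above are exposed under /sys/bus/rbd/devices/<id>/.
 * Illustrative read, assuming a mapped device with id 0:
 *
 *	$ cat /sys/bus/rbd/devices/0/size
 *	1073741824
 */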
4224
Ilya Dryomov6cac4692015-10-16 20:11:25 +02004225static void rbd_dev_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004226
Bhumika Goyalb9942bc2017-02-11 12:14:38 +05304227static const struct device_type rbd_device_type = {
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004228 .name = "rbd",
4229 .groups = rbd_attr_groups,
Ilya Dryomov6cac4692015-10-16 20:11:25 +02004230 .release = rbd_dev_release,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004231};
4232
Alex Elder8b8fb992012-10-26 17:25:24 -05004233static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
4234{
4235 kref_get(&spec->kref);
4236
4237 return spec;
4238}
4239
4240static void rbd_spec_free(struct kref *kref);
4241static void rbd_spec_put(struct rbd_spec *spec)
4242{
4243 if (spec)
4244 kref_put(&spec->kref, rbd_spec_free);
4245}
4246
4247static struct rbd_spec *rbd_spec_alloc(void)
4248{
4249 struct rbd_spec *spec;
4250
4251 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
4252 if (!spec)
4253 return NULL;
Ilya Dryomov04077592014-07-23 17:11:20 +04004254
4255 spec->pool_id = CEPH_NOPOOL;
4256 spec->snap_id = CEPH_NOSNAP;
Alex Elder8b8fb992012-10-26 17:25:24 -05004257 kref_init(&spec->kref);
4258
Alex Elder8b8fb992012-10-26 17:25:24 -05004259 return spec;
4260}
4261
4262static void rbd_spec_free(struct kref *kref)
4263{
4264 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
4265
4266 kfree(spec->pool_name);
4267 kfree(spec->image_id);
4268 kfree(spec->image_name);
4269 kfree(spec->snap_name);
4270 kfree(spec);
4271}
4272
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004273static void rbd_dev_free(struct rbd_device *rbd_dev)
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004274{
Ilya Dryomov99d16942016-08-12 16:11:41 +02004275 WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004276 WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004277
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004278 ceph_oid_destroy(&rbd_dev->header_oid);
Ilya Dryomov6b6dddb2016-08-05 16:15:38 +02004279 ceph_oloc_destroy(&rbd_dev->header_oloc);
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004280 kfree(rbd_dev->config_info);
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004281
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004282 rbd_put_client(rbd_dev->rbd_client);
4283 rbd_spec_put(rbd_dev->spec);
4284 kfree(rbd_dev->opts);
4285 kfree(rbd_dev);
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004286}
4287
4288static void rbd_dev_release(struct device *dev)
4289{
4290 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4291 bool need_put = !!rbd_dev->opts;
4292
4293 if (need_put) {
4294 destroy_workqueue(rbd_dev->task_wq);
4295 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4296 }
4297
4298 rbd_dev_free(rbd_dev);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004299
4300 /*
4301 * This is racy, but way better than dropping the module ref outside of
4302 * the release callback. The race window is pretty small, so
4303 * doing something similar to dm (dm-builtin.c) is overkill.
4304 */
4305 if (need_put)
4306 module_put(THIS_MODULE);
4307}
4308
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004309static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
4310 struct rbd_spec *spec)
Alex Elderc53d5892012-10-25 23:34:42 -05004311{
4312 struct rbd_device *rbd_dev;
4313
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004314 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
Alex Elderc53d5892012-10-25 23:34:42 -05004315 if (!rbd_dev)
4316 return NULL;
4317
4318 spin_lock_init(&rbd_dev->lock);
4319 INIT_LIST_HEAD(&rbd_dev->node);
Alex Elderc53d5892012-10-25 23:34:42 -05004320 init_rwsem(&rbd_dev->header_rwsem);
4321
Ilya Dryomov7e973322017-01-25 18:16:22 +01004322 rbd_dev->header.data_pool_id = CEPH_NOPOOL;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004323 ceph_oid_init(&rbd_dev->header_oid);
Ilya Dryomov431a02c2017-01-25 18:16:21 +01004324 rbd_dev->header_oloc.pool = spec->pool_id;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004325
Ilya Dryomov99d16942016-08-12 16:11:41 +02004326 mutex_init(&rbd_dev->watch_mutex);
4327 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
4328 INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
4329
Ilya Dryomoved95b212016-08-12 16:40:02 +02004330 init_rwsem(&rbd_dev->lock_rwsem);
4331 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
4332 INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
4333 INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
4334 INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
4335 INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
4336 init_waitqueue_head(&rbd_dev->lock_waitq);
4337
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004338 rbd_dev->dev.bus = &rbd_bus_type;
4339 rbd_dev->dev.type = &rbd_device_type;
4340 rbd_dev->dev.parent = &rbd_root_dev;
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004341 device_initialize(&rbd_dev->dev);
4342
Alex Elderc53d5892012-10-25 23:34:42 -05004343 rbd_dev->rbd_client = rbdc;
Ilya Dryomovd1475432015-06-22 13:24:48 +03004344 rbd_dev->spec = spec;
Alex Elder0903e872012-11-14 12:25:19 -06004345
Alex Elderc53d5892012-10-25 23:34:42 -05004346 return rbd_dev;
4347}
4348
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004349/*
4350 * Create an rbd_dev representing a user-visible mapping.
4351 */
4352static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
4353 struct rbd_spec *spec,
4354 struct rbd_options *opts)
4355{
4356 struct rbd_device *rbd_dev;
4357
4358 rbd_dev = __rbd_dev_create(rbdc, spec);
4359 if (!rbd_dev)
4360 return NULL;
4361
4362 rbd_dev->opts = opts;
4363
4364 /* get an id and fill in device name */
4365 rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
4366 minor_to_rbd_dev_id(1 << MINORBITS),
4367 GFP_KERNEL);
4368 if (rbd_dev->dev_id < 0)
4369 goto fail_rbd_dev;
4370
4371 sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
4372 rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
4373 rbd_dev->name);
4374 if (!rbd_dev->task_wq)
4375 goto fail_dev_id;
4376
4377 /* we have a ref from do_rbd_add() */
4378 __module_get(THIS_MODULE);
4379
4380 dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
4381 return rbd_dev;
4382
4383fail_dev_id:
4384 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4385fail_rbd_dev:
4386 rbd_dev_free(rbd_dev);
4387 return NULL;
4388}
4389
Alex Elderc53d5892012-10-25 23:34:42 -05004390static void rbd_dev_destroy(struct rbd_device *rbd_dev)
4391{
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004392 if (rbd_dev)
4393 put_device(&rbd_dev->dev);
Alex Elderc53d5892012-10-25 23:34:42 -05004394}
4395
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004396/*
Alex Elder9d475de2012-07-03 16:01:19 -05004397 * Get the size and object order for an image snapshot, or, if
4398 * snap_id is CEPH_NOSNAP, for the base image.
4400 */
4401static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
4402 u8 *order, u64 *snap_size)
4403{
4404 __le64 snapid = cpu_to_le64(snap_id);
4405 int ret;
4406 struct {
4407 u8 order;
4408 __le64 size;
4409 } __attribute__ ((packed)) size_buf = { 0 };
4410
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004411 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4412 &rbd_dev->header_oloc, "get_size",
4413 &snapid, sizeof(snapid),
4414 &size_buf, sizeof(size_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06004415 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder9d475de2012-07-03 16:01:19 -05004416 if (ret < 0)
4417 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05004418 if (ret < sizeof (size_buf))
4419 return -ERANGE;
Alex Elder9d475de2012-07-03 16:01:19 -05004420
Josh Durginc3545572013-08-28 17:08:10 -07004421 if (order) {
Alex Elderc86f86e2013-04-25 15:09:41 -05004422 *order = size_buf.order;
Josh Durginc3545572013-08-28 17:08:10 -07004423 dout(" order %u", (unsigned int)*order);
4424 }
Alex Elder9d475de2012-07-03 16:01:19 -05004425 *snap_size = le64_to_cpu(size_buf.size);
4426
Josh Durginc3545572013-08-28 17:08:10 -07004427 dout(" snap_id 0x%016llx snap_size = %llu\n",
4428 (unsigned long long)snap_id,
Alex Elder57385b52013-04-21 12:14:45 -05004429 (unsigned long long)*snap_size);
Alex Elder9d475de2012-07-03 16:01:19 -05004430
4431 return 0;
4432}
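/*
 * Shape of the get_size reply decoded above (for illustration):
 *
 *	u8     order;		object size is 1 << order, e.g. 22 -> 4 MiB
 *	__le64 size;		image/snapshot size in bytes
 */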
4433
4434static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
4435{
4436 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
4437 &rbd_dev->header.obj_order,
4438 &rbd_dev->header.image_size);
4439}
4440
Alex Elder1e130192012-07-03 16:01:19 -05004441static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
4442{
4443 void *reply_buf;
4444 int ret;
4445 void *p;
4446
4447 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
4448 if (!reply_buf)
4449 return -ENOMEM;
4450
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004451 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4452 &rbd_dev->header_oloc, "get_object_prefix",
4453 NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
Alex Elder36be9a72013-01-19 00:30:28 -06004454 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder1e130192012-07-03 16:01:19 -05004455 if (ret < 0)
4456 goto out;
4457
4458 p = reply_buf;
4459 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
Alex Elder57385b52013-04-21 12:14:45 -05004460 p + ret, NULL, GFP_NOIO);
4461 ret = 0;
Alex Elder1e130192012-07-03 16:01:19 -05004462
4463 if (IS_ERR(rbd_dev->header.object_prefix)) {
4464 ret = PTR_ERR(rbd_dev->header.object_prefix);
4465 rbd_dev->header.object_prefix = NULL;
4466 } else {
4467 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
4468 }
Alex Elder1e130192012-07-03 16:01:19 -05004469out:
4470 kfree(reply_buf);
4471
4472 return ret;
4473}
4474
Alex Elderb1b54022012-07-03 16:01:19 -05004475static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
4476 u64 *snap_features)
4477{
4478 __le64 snapid = cpu_to_le64(snap_id);
4479 struct {
4480 __le64 features;
4481 __le64 incompat;
Alex Elder41579762013-04-21 12:14:45 -05004482 } __attribute__ ((packed)) features_buf = { 0 };
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004483 u64 unsup;
Alex Elderb1b54022012-07-03 16:01:19 -05004484 int ret;
4485
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004486 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4487 &rbd_dev->header_oloc, "get_features",
4488 &snapid, sizeof(snapid),
4489 &features_buf, sizeof(features_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06004490 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderb1b54022012-07-03 16:01:19 -05004491 if (ret < 0)
4492 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05004493 if (ret < sizeof (features_buf))
4494 return -ERANGE;
Alex Elderd8891402012-10-09 13:50:17 -07004495
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004496 unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
4497 if (unsup) {
4498 rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
4499 unsup);
Alex Elderb8f5c6e2012-11-01 08:39:26 -05004500 return -ENXIO;
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004501 }
Alex Elderd8891402012-10-09 13:50:17 -07004502
Alex Elderb1b54022012-07-03 16:01:19 -05004503 *snap_features = le64_to_cpu(features_buf.features);
4504
4505 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
Alex Elder57385b52013-04-21 12:14:45 -05004506 (unsigned long long)snap_id,
4507 (unsigned long long)*snap_features,
4508 (unsigned long long)le64_to_cpu(features_buf.incompat));
Alex Elderb1b54022012-07-03 16:01:19 -05004509
4510 return 0;
4511}
4512
4513static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
4514{
4515 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
4516 &rbd_dev->header.features);
4517}
4518
Alex Elder86b00e02012-10-25 23:34:42 -05004519static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
4520{
4521 struct rbd_spec *parent_spec;
4522 size_t size;
4523 void *reply_buf = NULL;
4524 __le64 snapid;
4525 void *p;
4526 void *end;
Alex Elder642a2532013-05-06 17:40:33 -05004527 u64 pool_id;
Alex Elder86b00e02012-10-25 23:34:42 -05004528 char *image_id;
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004529 u64 snap_id;
Alex Elder86b00e02012-10-25 23:34:42 -05004530 u64 overlap;
Alex Elder86b00e02012-10-25 23:34:42 -05004531 int ret;
4532
4533 parent_spec = rbd_spec_alloc();
4534 if (!parent_spec)
4535 return -ENOMEM;
4536
4537 size = sizeof (__le64) + /* pool_id */
4538 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
4539 sizeof (__le64) + /* snap_id */
4540 sizeof (__le64); /* overlap */
4541 reply_buf = kmalloc(size, GFP_KERNEL);
4542 if (!reply_buf) {
4543 ret = -ENOMEM;
4544 goto out_err;
4545 }
4546
Ilya Dryomov4d9b67c2014-07-24 10:42:13 +04004547 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004548 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4549 &rbd_dev->header_oloc, "get_parent",
4550 &snapid, sizeof(snapid), reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06004551 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder86b00e02012-10-25 23:34:42 -05004552 if (ret < 0)
4553 goto out_err;
4554
Alex Elder86b00e02012-10-25 23:34:42 -05004555 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05004556 end = reply_buf + ret;
4557 ret = -ERANGE;
Alex Elder642a2532013-05-06 17:40:33 -05004558 ceph_decode_64_safe(&p, end, pool_id, out_err);
Alex Elder392a9da2013-05-06 17:40:33 -05004559 if (pool_id == CEPH_NOPOOL) {
4560 /*
4561 * Either the parent never existed, or we have
4562 * a record of it but the image got flattened so it no
4563 * longer has a parent. When the parent of a
4564 * layered image disappears we immediately set the
4565 * overlap to 0. The effect of this is that all new
4566 * requests will be treated as if the image had no
4567 * parent.
4568 */
4569 if (rbd_dev->parent_overlap) {
4570 rbd_dev->parent_overlap = 0;
Alex Elder392a9da2013-05-06 17:40:33 -05004571 rbd_dev_parent_put(rbd_dev);
4572 pr_info("%s: clone image has been flattened\n",
4573 rbd_dev->disk->disk_name);
4574 }
4575
Alex Elder86b00e02012-10-25 23:34:42 -05004576 goto out; /* No parent? No problem. */
Alex Elder392a9da2013-05-06 17:40:33 -05004577 }
Alex Elder86b00e02012-10-25 23:34:42 -05004578
Alex Elder0903e872012-11-14 12:25:19 -06004579 /* The ceph file layout needs to fit pool id in 32 bits */
4580
4581 ret = -EIO;
Alex Elder642a2532013-05-06 17:40:33 -05004582 if (pool_id > (u64)U32_MAX) {
Ilya Dryomov9584d502014-07-11 12:11:20 +04004583 rbd_warn(NULL, "parent pool id too large (%llu > %u)",
Alex Elder642a2532013-05-06 17:40:33 -05004584 (unsigned long long)pool_id, U32_MAX);
Alex Elder57385b52013-04-21 12:14:45 -05004585 goto out_err;
Alex Elderc0cd10db2013-04-26 09:43:47 -05004586 }
Alex Elder0903e872012-11-14 12:25:19 -06004587
Alex Elder979ed482012-11-01 08:39:26 -05004588 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elder86b00e02012-10-25 23:34:42 -05004589 if (IS_ERR(image_id)) {
4590 ret = PTR_ERR(image_id);
4591 goto out_err;
4592 }
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004593 ceph_decode_64_safe(&p, end, snap_id, out_err);
Alex Elder86b00e02012-10-25 23:34:42 -05004594 ceph_decode_64_safe(&p, end, overlap, out_err);
4595
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004596 /*
4597 * The parent won't change (except when the clone is
4598 * flattened, which is handled above). So we only need to
4599 * record the parent spec if we have not already done so.
4600 */
4601 if (!rbd_dev->parent_spec) {
4602 parent_spec->pool_id = pool_id;
4603 parent_spec->image_id = image_id;
4604 parent_spec->snap_id = snap_id;
Alex Elder70cf49c2013-05-06 17:40:33 -05004605 rbd_dev->parent_spec = parent_spec;
4606 parent_spec = NULL; /* rbd_dev now owns this */
Ilya Dryomovfbba11b2014-06-27 21:46:33 +04004607 } else {
4608 kfree(image_id);
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004609 }
4610
4611 /*
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004612 * We always update the parent overlap. If it's zero we issue
4613 * a warning, as we will proceed as if there was no parent.
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004614 */
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004615 if (!overlap) {
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004616 if (parent_spec) {
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004617 /* refresh, careful to warn just once */
4618 if (rbd_dev->parent_overlap)
4619 rbd_warn(rbd_dev,
4620 "clone now standalone (overlap became 0)");
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004621 } else {
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004622 /* initial probe */
4623 rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004624 }
Alex Elder70cf49c2013-05-06 17:40:33 -05004625 }
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004626 rbd_dev->parent_overlap = overlap;
4627
Alex Elder86b00e02012-10-25 23:34:42 -05004628out:
4629 ret = 0;
4630out_err:
4631 kfree(reply_buf);
4632 rbd_spec_put(parent_spec);
4633
4634 return ret;
4635}
4636
Alex Eldercc070d52013-04-21 12:14:45 -05004637static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
4638{
4639 struct {
4640 __le64 stripe_unit;
4641 __le64 stripe_count;
4642 } __attribute__ ((packed)) striping_info_buf = { 0 };
4643 size_t size = sizeof (striping_info_buf);
4644 void *p;
Alex Eldercc070d52013-04-21 12:14:45 -05004645 int ret;
4646
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004647 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4648 &rbd_dev->header_oloc, "get_stripe_unit_count",
4649 NULL, 0, &striping_info_buf, size);
Alex Eldercc070d52013-04-21 12:14:45 -05004650 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4651 if (ret < 0)
4652 return ret;
4653 if (ret < size)
4654 return -ERANGE;
4655
Alex Eldercc070d52013-04-21 12:14:45 -05004656 p = &striping_info_buf;
Ilya Dryomovb1331852018-02-07 12:09:12 +01004657 rbd_dev->header.stripe_unit = ceph_decode_64(&p);
4658 rbd_dev->header.stripe_count = ceph_decode_64(&p);
Alex Eldercc070d52013-04-21 12:14:45 -05004659 return 0;
4660}
4661
Ilya Dryomov7e973322017-01-25 18:16:22 +01004662static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
4663{
4664 __le64 data_pool_id;
4665 int ret;
4666
4667 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4668 &rbd_dev->header_oloc, "get_data_pool",
4669 NULL, 0, &data_pool_id, sizeof(data_pool_id));
4670 if (ret < 0)
4671 return ret;
4672 if (ret < sizeof(data_pool_id))
4673 return -EBADMSG;
4674
4675 rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
4676 WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
4677 return 0;
4678}
4679
Alex Elder9e15b772012-10-30 19:40:33 -05004680static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
4681{
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004682 CEPH_DEFINE_OID_ONSTACK(oid);
Alex Elder9e15b772012-10-30 19:40:33 -05004683 size_t image_id_size;
4684 char *image_id;
4685 void *p;
4686 void *end;
4687 size_t size;
4688 void *reply_buf = NULL;
4689 size_t len = 0;
4690 char *image_name = NULL;
4691 int ret;
4692
4693 rbd_assert(!rbd_dev->spec->image_name);
4694
Alex Elder69e7a022012-11-01 08:39:26 -05004695 len = strlen(rbd_dev->spec->image_id);
4696 image_id_size = sizeof (__le32) + len;
Alex Elder9e15b772012-10-30 19:40:33 -05004697 image_id = kmalloc(image_id_size, GFP_KERNEL);
4698 if (!image_id)
4699 return NULL;
4700
4701 p = image_id;
Alex Elder41579762013-04-21 12:14:45 -05004702 end = image_id + image_id_size;
Alex Elder57385b52013-04-21 12:14:45 -05004703 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
Alex Elder9e15b772012-10-30 19:40:33 -05004704
4705 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
4706 reply_buf = kmalloc(size, GFP_KERNEL);
4707 if (!reply_buf)
4708 goto out;
4709
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004710 ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
4711 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
4712 "dir_get_name", image_id, image_id_size,
4713 reply_buf, size);
Alex Elder9e15b772012-10-30 19:40:33 -05004714 if (ret < 0)
4715 goto out;
4716 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05004717 end = reply_buf + ret;
4718
Alex Elder9e15b772012-10-30 19:40:33 -05004719 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
4720 if (IS_ERR(image_name))
4721 image_name = NULL;
4722 else
4723 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
4724out:
4725 kfree(reply_buf);
4726 kfree(image_id);
4727
4728 return image_name;
4729}
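/*
 * Note: RBD_DIRECTORY names the "rbd_directory" object, which maps
 * between image names and ids; the dir_get_name call above is the
 * inverse of the "rbd_id.<name>" get_id lookup done at probe time in
 * rbd_dev_image_id().
 */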
4730
Alex Elder2ad3d712013-04-30 00:44:33 -05004731static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4732{
4733 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4734 const char *snap_name;
4735 u32 which = 0;
4736
4737 /* Skip over names until we find the one we are looking for */
4738
4739 snap_name = rbd_dev->header.snap_names;
4740 while (which < snapc->num_snaps) {
4741 if (!strcmp(name, snap_name))
4742 return snapc->snaps[which];
4743 snap_name += strlen(snap_name) + 1;
4744 which++;
4745 }
4746 return CEPH_NOSNAP;
4747}
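/*
 * Format 1 snapshot names are kept in a single buffer of back-to-back
 * NUL-terminated strings, parallel to snapc->snaps[]. Illustrative
 * layout for three snapshots:
 *
 *	snap_names: "snap1\0snap2\0snap3\0"
 *	snaps[]:    id(snap1), id(snap2), id(snap3)
 */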
4748
4749static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4750{
4751 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4752 u32 which;
4753 bool found = false;
4754 u64 snap_id;
4755
4756 for (which = 0; !found && which < snapc->num_snaps; which++) {
4757 const char *snap_name;
4758
4759 snap_id = snapc->snaps[which];
4760 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
Josh Durginefadc982013-08-29 19:16:42 -07004761 if (IS_ERR(snap_name)) {
4762 /* ignore no-longer existing snapshots */
4763 if (PTR_ERR(snap_name) == -ENOENT)
4764 continue;
4765 else
4766 break;
4767 }
Alex Elder2ad3d712013-04-30 00:44:33 -05004768 found = !strcmp(name, snap_name);
4769 kfree(snap_name);
4770 }
4771 return found ? snap_id : CEPH_NOSNAP;
4772}
4773
4774/*
4775 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
4776 * no snapshot by that name is found, or if an error occurs.
4777 */
4778static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4779{
4780 if (rbd_dev->image_format == 1)
4781 return rbd_v1_snap_id_by_name(rbd_dev, name);
4782
4783 return rbd_v2_snap_id_by_name(rbd_dev, name);
4784}
4785
Alex Elder9e15b772012-10-30 19:40:33 -05004786/*
Ilya Dryomov04077592014-07-23 17:11:20 +04004787 * An image being mapped will have everything but the snap id.
Alex Elder9e15b772012-10-30 19:40:33 -05004788 */
Ilya Dryomov04077592014-07-23 17:11:20 +04004789static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
4790{
4791 struct rbd_spec *spec = rbd_dev->spec;
4792
4793 rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
4794 rbd_assert(spec->image_id && spec->image_name);
4795 rbd_assert(spec->snap_name);
4796
4797 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
4798 u64 snap_id;
4799
4800 snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
4801 if (snap_id == CEPH_NOSNAP)
4802 return -ENOENT;
4803
4804 spec->snap_id = snap_id;
4805 } else {
4806 spec->snap_id = CEPH_NOSNAP;
4807 }
4808
4809 return 0;
4810}
4811
4812/*
4813 * A parent image will have all ids but none of the names.
4814 *
4815 * All names in an rbd spec are dynamically allocated. It's OK if we
4816 * can't figure out the name for an image id.
4817 */
4818static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
Alex Elder9e15b772012-10-30 19:40:33 -05004819{
Alex Elder2e9f7f12013-04-26 09:43:48 -05004820 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4821 struct rbd_spec *spec = rbd_dev->spec;
4822 const char *pool_name;
4823 const char *image_name;
4824 const char *snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05004825 int ret;
4826
Ilya Dryomov04077592014-07-23 17:11:20 +04004827 rbd_assert(spec->pool_id != CEPH_NOPOOL);
4828 rbd_assert(spec->image_id);
4829 rbd_assert(spec->snap_id != CEPH_NOSNAP);
Alex Elder9e15b772012-10-30 19:40:33 -05004830
Alex Elder2e9f7f12013-04-26 09:43:48 -05004831 /* Get the pool name; we have to make our own copy of this */
Alex Elder9e15b772012-10-30 19:40:33 -05004832
Alex Elder2e9f7f12013-04-26 09:43:48 -05004833 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
4834 if (!pool_name) {
4835 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05004836 return -EIO;
4837 }
Alex Elder2e9f7f12013-04-26 09:43:48 -05004838 pool_name = kstrdup(pool_name, GFP_KERNEL);
4839 if (!pool_name)
Alex Elder9e15b772012-10-30 19:40:33 -05004840 return -ENOMEM;
4841
4842 /* Fetch the image name; tolerate failure here */
4843
Alex Elder2e9f7f12013-04-26 09:43:48 -05004844 image_name = rbd_dev_image_name(rbd_dev);
4845 if (!image_name)
Alex Elder06ecc6c2012-11-01 10:17:15 -05004846 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05004847
Ilya Dryomov04077592014-07-23 17:11:20 +04004848 /* Fetch the snapshot name */
Alex Elder9e15b772012-10-30 19:40:33 -05004849
Alex Elder2e9f7f12013-04-26 09:43:48 -05004850 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
Josh Durginda6a6b62013-09-04 17:57:31 -07004851 if (IS_ERR(snap_name)) {
4852 ret = PTR_ERR(snap_name);
Alex Elder9e15b772012-10-30 19:40:33 -05004853 goto out_err;
Alex Elder2e9f7f12013-04-26 09:43:48 -05004854 }
4855
4856 spec->pool_name = pool_name;
4857 spec->image_name = image_name;
4858 spec->snap_name = snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05004859
4860 return 0;
Ilya Dryomov04077592014-07-23 17:11:20 +04004861
Alex Elder9e15b772012-10-30 19:40:33 -05004862out_err:
Alex Elder2e9f7f12013-04-26 09:43:48 -05004863 kfree(image_name);
4864 kfree(pool_name);
Alex Elder9e15b772012-10-30 19:40:33 -05004865 return ret;
4866}
4867
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004868static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
Alex Elder35d489f2012-07-03 16:01:19 -05004869{
4870 size_t size;
4871 int ret;
4872 void *reply_buf;
4873 void *p;
4874 void *end;
4875 u64 seq;
4876 u32 snap_count;
4877 struct ceph_snap_context *snapc;
4878 u32 i;
4879
4880 /*
4881 * We'll need room for the seq value (maximum snapshot id),
4882 * snapshot count, and array of that many snapshot ids.
4883 * For now we have a fixed upper limit on the number we're
4884 * prepared to receive.
4885 */
4886 size = sizeof (__le64) + sizeof (__le32) +
4887 RBD_MAX_SNAP_COUNT * sizeof (__le64);
4888 reply_buf = kzalloc(size, GFP_KERNEL);
4889 if (!reply_buf)
4890 return -ENOMEM;
4891
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004892 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4893 &rbd_dev->header_oloc, "get_snapcontext",
4894 NULL, 0, reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06004895 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder35d489f2012-07-03 16:01:19 -05004896 if (ret < 0)
4897 goto out;
4898
Alex Elder35d489f2012-07-03 16:01:19 -05004899 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05004900 end = reply_buf + ret;
4901 ret = -ERANGE;
Alex Elder35d489f2012-07-03 16:01:19 -05004902 ceph_decode_64_safe(&p, end, seq, out);
4903 ceph_decode_32_safe(&p, end, snap_count, out);
4904
4905 /*
4906 * Make sure the reported number of snapshot ids wouldn't go
4907 * beyond the end of our buffer. But before checking that,
4908 * make sure the computed size of the snapshot context we
4909 * allocate is representable in a size_t.
4910 */
4911 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
4912 / sizeof (u64)) {
4913 ret = -EINVAL;
4914 goto out;
4915 }
4916 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
4917 goto out;
Alex Elder468521c2013-04-26 09:43:47 -05004918 ret = 0;
Alex Elder35d489f2012-07-03 16:01:19 -05004919
Alex Elder812164f82013-04-30 00:44:32 -05004920 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
Alex Elder35d489f2012-07-03 16:01:19 -05004921 if (!snapc) {
4922 ret = -ENOMEM;
4923 goto out;
4924 }
Alex Elder35d489f2012-07-03 16:01:19 -05004925 snapc->seq = seq;
Alex Elder35d489f2012-07-03 16:01:19 -05004926 for (i = 0; i < snap_count; i++)
4927 snapc->snaps[i] = ceph_decode_64(&p);
4928
Alex Elder49ece552013-05-06 08:37:00 -05004929 ceph_put_snap_context(rbd_dev->header.snapc);
Alex Elder35d489f2012-07-03 16:01:19 -05004930 rbd_dev->header.snapc = snapc;
4931
4932 dout(" snap context seq = %llu, snap_count = %u\n",
Alex Elder57385b52013-04-21 12:14:45 -05004933 (unsigned long long)seq, (unsigned int)snap_count);
Alex Elder35d489f2012-07-03 16:01:19 -05004934out:
4935 kfree(reply_buf);
4936
Alex Elder57385b52013-04-21 12:14:45 -05004937 return ret;
Alex Elder35d489f2012-07-03 16:01:19 -05004938}
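/*
 * Shape of the get_snapcontext reply decoded above (for illustration):
 *
 *	__le64 seq;			maximum snapshot id
 *	__le32 snap_count;
 *	__le64 snaps[snap_count];	snapshot ids
 */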
4939
Alex Elder54cac612013-04-30 00:44:33 -05004940static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
4941 u64 snap_id)
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004942{
4943 size_t size;
4944 void *reply_buf;
Alex Elder54cac612013-04-30 00:44:33 -05004945 __le64 snapid;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004946 int ret;
4947 void *p;
4948 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004949 char *snap_name;
4950
4951 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
4952 reply_buf = kmalloc(size, GFP_KERNEL);
4953 if (!reply_buf)
4954 return ERR_PTR(-ENOMEM);
4955
Alex Elder54cac612013-04-30 00:44:33 -05004956 snapid = cpu_to_le64(snap_id);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004957 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4958 &rbd_dev->header_oloc, "get_snapshot_name",
4959 &snapid, sizeof(snapid), reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06004960 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderf40eb342013-04-25 15:09:42 -05004961 if (ret < 0) {
4962 snap_name = ERR_PTR(ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004963 goto out;
Alex Elderf40eb342013-04-25 15:09:42 -05004964 }
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004965
4966 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05004967 end = reply_buf + ret;
Alex Eldere5c35532012-10-25 23:34:41 -05004968 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderf40eb342013-04-25 15:09:42 -05004969 if (IS_ERR(snap_name))
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004970 goto out;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004971
Alex Elderf40eb342013-04-25 15:09:42 -05004972 dout(" snap_id 0x%016llx snap_name = %s\n",
Alex Elder54cac612013-04-30 00:44:33 -05004973 (unsigned long long)snap_id, snap_name);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004974out:
4975 kfree(reply_buf);
4976
Alex Elderf40eb342013-04-25 15:09:42 -05004977 return snap_name;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004978}
4979
Alex Elder2df3fac2013-05-06 09:51:30 -05004980static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
Alex Elder117973f2012-08-31 17:29:55 -05004981{
Alex Elder2df3fac2013-05-06 09:51:30 -05004982 bool first_time = rbd_dev->header.object_prefix == NULL;
Alex Elder117973f2012-08-31 17:29:55 -05004983 int ret;
Alex Elder117973f2012-08-31 17:29:55 -05004984
Josh Durgin1617e402013-06-12 14:43:10 -07004985 ret = rbd_dev_v2_image_size(rbd_dev);
4986 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05004987 return ret;
Josh Durgin1617e402013-06-12 14:43:10 -07004988
Alex Elder2df3fac2013-05-06 09:51:30 -05004989 if (first_time) {
4990 ret = rbd_dev_v2_header_onetime(rbd_dev);
4991 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05004992 return ret;
Alex Elder2df3fac2013-05-06 09:51:30 -05004993 }
4994
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004995 ret = rbd_dev_v2_snap_context(rbd_dev);
Ilya Dryomovd194cd12015-08-31 18:22:10 +03004996 if (ret && first_time) {
4997 kfree(rbd_dev->header.object_prefix);
4998 rbd_dev->header.object_prefix = NULL;
4999 }
Alex Elder117973f2012-08-31 17:29:55 -05005000
5001 return ret;
5002}
5003
Ilya Dryomova720ae02014-07-23 17:11:19 +04005004static int rbd_dev_header_info(struct rbd_device *rbd_dev)
5005{
5006 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5007
5008 if (rbd_dev->image_format == 1)
5009 return rbd_dev_v1_header_info(rbd_dev);
5010
5011 return rbd_dev_v2_header_info(rbd_dev);
5012}
5013
Alex Elder1ddbe942012-01-29 13:57:44 -06005014/*
Alex Eldere28fff262012-02-02 08:13:30 -06005015 * Skips over white space at *buf, and updates *buf to point to the
5016 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06005017 * the token (string of non-white space characters) found. Note
5018 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06005019 */
5020static inline size_t next_token(const char **buf)
5021{
5022 /*
5023 * These are the characters that produce nonzero for
5024 * isspace() in the "C" and "POSIX" locales.
5025 */
5026 const char *spaces = " \f\n\r\t\v";
5027
5028 *buf += strspn(*buf, spaces); /* Find start of token */
5029
5030 return strcspn(*buf, spaces); /* Return token length */
5031}
5032
5033/*
Alex Elderea3352f2012-07-09 21:04:23 -05005034 * Finds the next token in *buf, dynamically allocates a buffer big
5035 * enough to hold a copy of it, and copies the token into the new
5036 * buffer. The copy is guaranteed to be terminated with '\0'. Note
5037 * that a duplicate buffer is created even for a zero-length token.
5038 *
5039 * Returns a pointer to the newly-allocated duplicate, or a null
5040 * pointer if memory for the duplicate was not available. If
5041 * the lenp argument is a non-null pointer, the length of the token
5042 * (not including the '\0') is returned in *lenp.
5043 *
5044 * If successful, the *buf pointer will be updated to point beyond
5045 * the end of the found token.
5046 *
5047 * Note: uses GFP_KERNEL for allocation.
5048 */
5049static inline char *dup_token(const char **buf, size_t *lenp)
5050{
5051 char *dup;
5052 size_t len;
5053
5054 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05005055 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05005056 if (!dup)
5057 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05005058 *(dup + len) = '\0';
5059 *buf += len;
5060
5061 if (lenp)
5062 *lenp = len;
5063
5064 return dup;
5065}
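/*
 * Sketch of how the two helpers above cooperate (illustrative):
 *
 *	const char *buf = "  rbd foo";
 *	char *tok = dup_token(&buf, NULL);	// tok = "rbd", buf -> " foo"
 *	...
 *	kfree(tok);
 */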
5066
5067/*
Alex Elder859c31d2012-10-25 23:34:42 -05005068 * Parse the options provided for an "rbd add" (i.e., rbd image
5069 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
5070 * and the data written is passed here via a NUL-terminated buffer.
5071 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05005072 *
Alex Elder859c31d2012-10-25 23:34:42 -05005073 * The information extracted from these options is recorded in
5074 * the other parameters which return dynamically-allocated
5075 * structures:
5076 * ceph_opts
5077 * The address of a pointer that will refer to a ceph options
5078 * structure. Caller must release the returned pointer using
5079 * ceph_destroy_options() when it is no longer needed.
5080 * rbd_opts
5081 * Address of an rbd options pointer. Fully initialized by
5082 * this function; caller must release with kfree().
5083 * spec
5084 * Address of an rbd image specification pointer. Fully
5085 * initialized by this function based on parsed options.
5086 * Caller must release with rbd_spec_put().
5087 *
5088 * The options passed take this form:
5089 * <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
5090 * where:
5091 * <mon_addrs>
5092 * A comma-separated list of one or more monitor addresses.
5093 * A monitor address is an ip address, optionally followed
5094 * by a port number (separated by a colon).
5095 * I.e.: ip1[:port1][,ip2[:port2]...]
5096 * <options>
5097 * A comma-separated list of ceph and/or rbd options.
5098 * <pool_name>
5099 * The name of the rados pool containing the rbd image.
5100 * <image_name>
5101 * The name of the image in that pool to map.
5102 * <snap_name>
5103 * An optional snapshot name. If provided, the mapping will
5104 * present data from the image at the time that snapshot was
5105 * created. The image head is used if no snapshot name is
5106 * provided. Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06005107 */
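/*
 * Illustrative add request (monitor address and names are made up):
 *
 *	$ echo "1.2.3.4:6789 name=admin rbd foo -" > /sys/bus/rbd/add
 *
 * maps the head of image "foo" in pool "rbd".
 */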
Alex Elder859c31d2012-10-25 23:34:42 -05005108static int rbd_add_parse_args(const char *buf,
Alex Elderdc79b112012-10-25 23:34:41 -05005109 struct ceph_options **ceph_opts,
Alex Elder859c31d2012-10-25 23:34:42 -05005110 struct rbd_options **opts,
5111 struct rbd_spec **rbd_spec)
Alex Eldera725f65e2012-02-02 08:13:30 -06005112{
Alex Elderd22f76e2012-07-12 10:46:35 -05005113 size_t len;
Alex Elder859c31d2012-10-25 23:34:42 -05005114 char *options;
Alex Elder0ddebc02012-10-25 23:34:41 -05005115 const char *mon_addrs;
Alex Elderecb4dc22013-04-26 09:43:47 -05005116 char *snap_name;
Alex Elder0ddebc02012-10-25 23:34:41 -05005117 size_t mon_addrs_size;
Alex Elder859c31d2012-10-25 23:34:42 -05005118 struct rbd_spec *spec = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05005119 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05005120 struct ceph_options *copts;
Alex Elderdc79b112012-10-25 23:34:41 -05005121 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06005122
5123 /* The first four tokens are required */
5124
Alex Elder7ef32142012-02-02 08:13:30 -06005125 len = next_token(&buf);
Alex Elder4fb5d6712012-11-01 10:17:15 -05005126 if (!len) {
5127 rbd_warn(NULL, "no monitor address(es) provided");
5128 return -EINVAL;
5129 }
Alex Elder0ddebc02012-10-25 23:34:41 -05005130 mon_addrs = buf;
Alex Elderf28e5652012-10-25 23:34:41 -05005131 mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06005132 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06005133
Alex Elderdc79b112012-10-25 23:34:41 -05005134 ret = -EINVAL;
Alex Elderf28e5652012-10-25 23:34:41 -05005135 options = dup_token(&buf, NULL);
5136 if (!options)
Alex Elderdc79b112012-10-25 23:34:41 -05005137 return -ENOMEM;
Alex Elder4fb5d6712012-11-01 10:17:15 -05005138 if (!*options) {
5139 rbd_warn(NULL, "no options provided");
5140 goto out_err;
5141 }
Alex Eldera725f65e2012-02-02 08:13:30 -06005142
Alex Elder859c31d2012-10-25 23:34:42 -05005143 spec = rbd_spec_alloc();
5144 if (!spec)
Alex Elderf28e5652012-10-25 23:34:41 -05005145 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05005146
5147 spec->pool_name = dup_token(&buf, NULL);
5148 if (!spec->pool_name)
5149 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05005150 if (!*spec->pool_name) {
5151 rbd_warn(NULL, "no pool name provided");
5152 goto out_err;
5153 }
Alex Eldere28fff262012-02-02 08:13:30 -06005154
Alex Elder69e7a022012-11-01 08:39:26 -05005155 spec->image_name = dup_token(&buf, NULL);
Alex Elder859c31d2012-10-25 23:34:42 -05005156 if (!spec->image_name)
Alex Elderf28e5652012-10-25 23:34:41 -05005157 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05005158 if (!*spec->image_name) {
5159 rbd_warn(NULL, "no image name provided");
5160 goto out_err;
5161 }
Alex Eldere28fff262012-02-02 08:13:30 -06005162
Alex Elderf28e5652012-10-25 23:34:41 -05005163 /*
5164 * Snapshot name is optional; default is to use "-"
5165 * (indicating the head/no snapshot).
5166 */
Alex Elder3feeb8942012-08-31 17:29:52 -05005167 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05005168 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05005169 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
5170 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elderf28e5652012-10-25 23:34:41 -05005171 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
Alex Elderdc79b112012-10-25 23:34:41 -05005172 ret = -ENAMETOOLONG;
Alex Elderf28e5652012-10-25 23:34:41 -05005173 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05005174 }
Alex Elderecb4dc22013-04-26 09:43:47 -05005175 snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
5176 if (!snap_name)
Alex Elderf28e5652012-10-25 23:34:41 -05005177 goto out_mem;
Alex Elderecb4dc22013-04-26 09:43:47 -05005178 *(snap_name + len) = '\0';
5179 spec->snap_name = snap_name;
Alex Eldere5c35532012-10-25 23:34:41 -05005180
Alex Elder0ddebc02012-10-25 23:34:41 -05005181 /* Initialize all rbd options to the defaults */
Alex Eldere28fff262012-02-02 08:13:30 -06005182
Alex Elder4e9afeb2012-10-25 23:34:41 -05005183 rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
5184 if (!rbd_opts)
5185 goto out_mem;
5186
5187 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
Ilya Dryomovb5584182015-06-23 16:21:19 +03005188 rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
Ilya Dryomov80de1912016-09-20 14:23:17 +02005189 rbd_opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
Ilya Dryomove010dd02017-04-13 12:17:39 +02005190 rbd_opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
Alex Elderd22f76e2012-07-12 10:46:35 -05005191
Alex Elder859c31d2012-10-25 23:34:42 -05005192 copts = ceph_parse_options(options, mon_addrs,
Alex Elder0ddebc02012-10-25 23:34:41 -05005193 mon_addrs + mon_addrs_size - 1,
Alex Elder4e9afeb2012-10-25 23:34:41 -05005194 parse_rbd_opts_token, rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05005195 if (IS_ERR(copts)) {
5196 ret = PTR_ERR(copts);
Alex Elderdc79b112012-10-25 23:34:41 -05005197 goto out_err;
5198 }
Alex Elder859c31d2012-10-25 23:34:42 -05005199 kfree(options);
5200
5201 *ceph_opts = copts;
Alex Elder4e9afeb2012-10-25 23:34:41 -05005202 *opts = rbd_opts;
Alex Elder859c31d2012-10-25 23:34:42 -05005203 *rbd_spec = spec;
Alex Elder0ddebc02012-10-25 23:34:41 -05005204
Alex Elderdc79b112012-10-25 23:34:41 -05005205 return 0;
Alex Elderf28e5652012-10-25 23:34:41 -05005206out_mem:
Alex Elderdc79b112012-10-25 23:34:41 -05005207 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05005208out_err:
Alex Elder859c31d2012-10-25 23:34:42 -05005209 kfree(rbd_opts);
5210 rbd_spec_put(spec);
Alex Elderf28e5652012-10-25 23:34:41 -05005211 kfree(options);
Alex Elderd22f76e2012-07-12 10:46:35 -05005212
Alex Elderdc79b112012-10-25 23:34:41 -05005213 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06005214}
5215
Ilya Dryomove010dd02017-04-13 12:17:39 +02005216static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
5217{
5218 down_write(&rbd_dev->lock_rwsem);
5219 if (__rbd_is_lock_owner(rbd_dev))
5220 rbd_unlock(rbd_dev);
5221 up_write(&rbd_dev->lock_rwsem);
5222}
5223
5224static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
5225{
Ilya Dryomov2f18d462018-04-04 10:15:38 +02005226 int ret;
5227
Ilya Dryomove010dd02017-04-13 12:17:39 +02005228 if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
5229 rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
5230 return -EINVAL;
5231 }
5232
5233 /* FIXME: "rbd map --exclusive" should be in interruptible */
5234 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov2f18d462018-04-04 10:15:38 +02005235 ret = rbd_wait_state_locked(rbd_dev, true);
Ilya Dryomove010dd02017-04-13 12:17:39 +02005236 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomov2f18d462018-04-04 10:15:38 +02005237 if (ret) {
Ilya Dryomove010dd02017-04-13 12:17:39 +02005238 rbd_warn(rbd_dev, "failed to acquire exclusive lock");
5239 return -EROFS;
5240 }
5241
5242 return 0;
5243}
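/*
 * Illustrative: the path above backs the "exclusive" map option
 * (e.g. "rbd map -o exclusive ..."); it requires the exclusive-lock
 * image feature and refuses the mapping with -EROFS if the lock
 * cannot be acquired.
 */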
5244
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005245/*
Alex Elder589d30e2012-07-10 20:30:11 -05005246 * An rbd format 2 image has a unique identifier, distinct from the
5247 * name given to it by the user. Internally, that identifier is
5248 * what's used to specify the names of objects related to the image.
5249 *
5250 * A special "rbd id" object is used to map an rbd image name to its
5251 * id. If that object doesn't exist, then there is no v2 rbd image
5252 * with the supplied name.
5253 *
5254 * This function will record the given rbd_dev's image_id field if
5255 * it can be determined, and in that case will return 0. If any
5256 * errors occur a negative errno will be returned and the rbd_dev's
5257 * image_id field will be unchanged (and should be NULL).
5258 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	CEPH_DEFINE_OID_ONSTACK(oid);
	void *response;
	char *image_id;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not). There's no
	 * need to fetch the image id again in this case. We
	 * do still need to set the image format though.
	 */
	if (rbd_dev->spec->image_id) {
		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;

		return 0;
	}

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
			       rbd_dev->spec->image_name);
	if (ret)
		return ret;

	dout("rbd id object name is %s\n", oid.name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof(__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* If it doesn't exist we'll assume it's a format 1 image */

	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
				  "get_id", NULL, 0,
				  response, RBD_IMAGE_ID_LEN_MAX);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret == -ENOENT) {
		image_id = kstrdup("", GFP_KERNEL);
		ret = image_id ? 0 : -ENOMEM;
		if (!ret)
			rbd_dev->image_format = 1;
	} else if (ret >= 0) {
		void *p = response;

		image_id = ceph_extract_encoded_string(&p, p + ret,
						       NULL, GFP_NOIO);
		ret = PTR_ERR_OR_ZERO(image_id);
		if (!ret)
			rbd_dev->image_format = 2;
	}

	if (!ret) {
		rbd_dev->spec->image_id = image_id;
		dout("image_id is %s\n", image_id);
	}
out:
	kfree(response);
	ceph_oid_destroy(&oid);
	return ret;
}

/*
 * Undo whatever state changes are made by a v1 or v2 header info
 * call.
 */
static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
{
	struct rbd_image_header *header;

	rbd_dev_parent_put(rbd_dev);

	/* Free dynamic fields from the header, then zero it out */

	header = &rbd_dev->header;
	ceph_put_snap_context(header->snapc);
	kfree(header->snap_sizes);
	kfree(header->snap_names);
	kfree(header->object_prefix);
	memset(header, 0, sizeof(*header));
}

static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
{
	int ret;

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret)
		goto out_err;

	/*
	 * Get and check features for the image. Currently the
	 * features are assumed to never change.
	 */
	ret = rbd_dev_v2_features(rbd_dev);
	if (ret)
		goto out_err;

	/* If the image supports fancy striping, get its parameters */

	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
		ret = rbd_dev_v2_striping_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
		ret = rbd_dev_v2_data_pool(rbd_dev);
		if (ret)
			goto out_err;
	}

	rbd_init_layout(rbd_dev);
	return 0;

out_err:
	rbd_dev->header.features = 0;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;
	return ret;
}

/*
 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
 * rbd_dev_image_probe() recursion depth, which means it's also the
 * length of the already discovered part of the parent chain.
 */
static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
{
	struct rbd_device *parent = NULL;
	int ret;

	if (!rbd_dev->parent_spec)
		return 0;

	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
		pr_info("parent chain is too long (%d)\n", depth);
		ret = -EINVAL;
		goto out_err;
	}

	parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
	if (!parent) {
		ret = -ENOMEM;
		goto out_err;
	}

	/*
	 * Images related by parent/child relationships always share
	 * rbd_client and spec/parent_spec, so bump their refcounts.
	 */
	__rbd_get_client(rbd_dev->rbd_client);
	rbd_spec_get(rbd_dev->parent_spec);

	ret = rbd_dev_image_probe(parent, depth);
	if (ret < 0)
		goto out_err;

	rbd_dev->parent = parent;
	atomic_set(&rbd_dev->parent_ref, 1);
	return 0;

out_err:
	rbd_dev_unparent(rbd_dev);
	rbd_dev_destroy(parent);
	return ret;
}

static void rbd_dev_device_release(struct rbd_device *rbd_dev)
{
	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
	rbd_dev_mapping_clear(rbd_dev);
	rbd_free_disk(rbd_dev);
	if (!single_major)
		unregister_blkdev(rbd_dev->major, rbd_dev->name);
}

/*
 * rbd_dev->header_rwsem must be locked for write and will be unlocked
 * upon return.
 */
static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
{
	int ret;

	/* Record our major and minor device numbers. */

	if (!single_major) {
		ret = register_blkdev(0, rbd_dev->name);
		if (ret < 0)
			goto err_out_unlock;

		rbd_dev->major = ret;
		rbd_dev->minor = 0;
	} else {
		rbd_dev->major = rbd_major;
		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
	}

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_dev_mapping_set(rbd_dev);
	if (ret)
		goto err_out_disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
	set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);

	ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
	if (ret)
		goto err_out_mapping;

	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
	up_write(&rbd_dev->header_rwsem);
	return 0;

err_out_mapping:
	rbd_dev_mapping_clear(rbd_dev);
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	if (!single_major)
		unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_unlock:
	up_write(&rbd_dev->header_rwsem);
	return ret;
}

static int rbd_dev_header_name(struct rbd_device *rbd_dev)
{
	struct rbd_spec *spec = rbd_dev->spec;
	int ret;

	/* Record the header object name for this rbd image. */

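	/*
	 * (For example, with the usual RBD_SUFFIX ".rbd" and
	 * RBD_HEADER_PREFIX "rbd_header.", a format 1 image named "foo"
	 * uses header object "foo.rbd", while a format 2 image with id
	 * "1234" uses "rbd_header.1234".)
	 */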
5507 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder332bb122013-04-27 09:59:30 -05005508 if (rbd_dev->image_format == 1)
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005509 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
5510 spec->image_name, RBD_SUFFIX);
Alex Elder332bb122013-04-27 09:59:30 -05005511 else
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005512 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
5513 RBD_HEADER_PREFIX, spec->image_id);
Alex Elder332bb122013-04-27 09:59:30 -05005514
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005515 return ret;
Alex Elder332bb122013-04-27 09:59:30 -05005516}

static void rbd_dev_image_release(struct rbd_device *rbd_dev)
{
	rbd_dev_unprobe(rbd_dev);
	if (rbd_dev->opts)
		rbd_unregister_watch(rbd_dev);
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;
}

/*
 * Probe for the existence of the header object for the given rbd
 * device. If this image is the one being mapped (i.e., not a
 * parent), initiate a watch on its header object before using that
 * object to get detailed information about the rbd image.
 */
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
{
	int ret;

	/*
	 * Get the id from the image id object. Unless there's an
	 * error, rbd_dev->spec->image_id will be filled in with
	 * a dynamically-allocated string, and rbd_dev->image_format
	 * will be set to either 1 or 2.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_header_name(rbd_dev);
	if (ret)
		goto err_out_format;

	if (!depth) {
		ret = rbd_register_watch(rbd_dev);
		if (ret) {
			if (ret == -ENOENT)
				pr_info("image %s/%s does not exist\n",
					rbd_dev->spec->pool_name,
					rbd_dev->spec->image_name);
			goto err_out_format;
		}
	}

	ret = rbd_dev_header_info(rbd_dev);
	if (ret)
		goto err_out_watch;

	/*
	 * If this image is the one being mapped, we have pool name and
	 * id, image name and id, and snap name - need to fill snap id.
	 * Otherwise this is a parent image, identified by pool, image
	 * and snap ids - need to fill in names for those ids.
	 */
	if (!depth)
		ret = rbd_spec_fill_snap_id(rbd_dev);
	else
		ret = rbd_spec_fill_names(rbd_dev);
	if (ret) {
		if (ret == -ENOENT)
			pr_info("snap %s/%s@%s does not exist\n",
				rbd_dev->spec->pool_name,
				rbd_dev->spec->image_name,
				rbd_dev->spec->snap_name);
		goto err_out_probe;
	}

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret)
			goto err_out_probe;

		/*
		 * Need to warn users if this image is the one being
		 * mapped and has a parent.
		 */
		if (!depth && rbd_dev->parent_spec)
			rbd_warn(rbd_dev,
				 "WARNING: kernel layering is EXPERIMENTAL!");
	}

	ret = rbd_dev_probe_parent(rbd_dev, depth);
	if (ret)
		goto err_out_probe;

	dout("discovered format %u image, header name is %s\n",
	     rbd_dev->image_format, rbd_dev->header_oid.name);
	return 0;

err_out_probe:
	rbd_dev_unprobe(rbd_dev);
err_out_watch:
	if (!depth)
		rbd_unregister_watch(rbd_dev);
err_out_format:
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;
	return ret;
}

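/*
 * Example (hypothetical monitor address, credentials and names):
 *
 *   $ echo "1.2.3.4:6789 name=admin,secret=<key> rbd foo" > /sys/bus/rbd/add
 *
 * parses to mon_addrs "1.2.3.4:6789", options "name=admin,secret=<key>",
 * pool "rbd", image "foo" and, with no snap name given, maps the image
 * head.
 */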
static ssize_t do_rbd_add(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	int rc;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto out;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}

	/* pick the pool */
	rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
	if (rc < 0) {
		if (rc == -ENOENT)
			pr_info("pool %s does not exist\n", spec->pool_name);
		goto err_out_client;
	}
	spec->pool_id = (u64)rc;

	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
	if (!rbd_dev) {
		rc = -ENOMEM;
		goto err_out_client;
	}
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */
	rbd_opts = NULL;	/* rbd_dev now owns this */

	rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
	if (!rbd_dev->config_info) {
		rc = -ENOMEM;
		goto err_out_rbd_dev;
	}

	down_write(&rbd_dev->header_rwsem);
	rc = rbd_dev_image_probe(rbd_dev, 0);
	if (rc < 0) {
		up_write(&rbd_dev->header_rwsem);
		goto err_out_rbd_dev;
	}

	/* If we are mapping a snapshot it must be marked read-only */
	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
		rbd_dev->opts->read_only = true;

	rc = rbd_dev_device_setup(rbd_dev);
	if (rc)
		goto err_out_image_probe;

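	/*
	 * "exclusive" is a map option (hypothetical invocation:
	 * "rbd map -o exclusive ..."); it requires the image's
	 * exclusive-lock feature and fails the map with -EROFS if the
	 * lock cannot be acquired up front.
	 */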
	if (rbd_dev->opts->exclusive) {
		rc = rbd_add_acquire_lock(rbd_dev);
		if (rc)
			goto err_out_device_setup;
	}

	/* Everything's ready. Announce the disk to the world. */

	rc = device_add(&rbd_dev->dev);
	if (rc)
		goto err_out_image_lock;

	add_disk(rbd_dev->disk);
	/* see rbd_init_disk() */
	blk_put_queue(rbd_dev->disk->queue);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);

	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
		rbd_dev->header.features);
	rc = count;
out:
	module_put(THIS_MODULE);
	return rc;

err_out_image_lock:
	rbd_dev_image_unlock(rbd_dev);
err_out_device_setup:
	rbd_dev_device_release(rbd_dev);
err_out_image_probe:
	rbd_dev_image_release(rbd_dev);
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	rbd_spec_put(spec);
	kfree(rbd_opts);
	goto out;
}

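/*
 * With single_major=Y all images share one major number and minors are
 * derived from dev_id, so new mappings must come in through
 * add_single_major; plain "add" is disabled to keep the two schemes
 * from mixing.
 */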
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	if (single_major)
		return -EINVAL;

	return do_rbd_add(bus, buf, count);
}

static ssize_t rbd_add_single_major(struct bus_type *bus,
				    const char *buf,
				    size_t count)
{
	return do_rbd_add(bus, buf, count);
}

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
{
	while (rbd_dev->parent) {
		struct rbd_device *first = rbd_dev;
		struct rbd_device *second = first->parent;
		struct rbd_device *third;

		/*
		 * Follow to the parent with no grandparent and
		 * remove it.
		 */
		while (second && (third = second->parent)) {
			first = second;
			second = third;
		}
		rbd_assert(second);
		rbd_dev_image_release(second);
		rbd_dev_destroy(second);
		first->parent = NULL;
		first->parent_overlap = 0;

		rbd_assert(first->parent_spec);
		rbd_spec_put(first->parent_spec);
		first->parent_spec = NULL;
	}
}

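/*
 * Example:
 *
 *   $ echo 0 > /sys/bus/rbd/remove           (unmap /dev/rbd0)
 *   $ echo "0 force" > /sys/bus/rbd/remove   (unmap even while open)
 */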
static ssize_t do_rbd_remove(struct bus_type *bus,
			     const char *buf,
			     size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct list_head *tmp;
	int dev_id;
	char opt_buf[6];
	bool already = false;
	bool force = false;
	int ret;

	dev_id = -1;
	opt_buf[0] = '\0';
	sscanf(buf, "%d %5s", &dev_id, opt_buf);
	if (dev_id < 0) {
		pr_err("dev_id out of range\n");
		return -EINVAL;
	}
	if (opt_buf[0] != '\0') {
		if (!strcmp(opt_buf, "force")) {
			force = true;
		} else {
			pr_err("bad remove option at '%s'\n", opt_buf);
			return -EINVAL;
		}
	}

	ret = -ENOENT;
	spin_lock(&rbd_dev_list_lock);
	list_for_each(tmp, &rbd_dev_list) {
		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id == dev_id) {
			ret = 0;
			break;
		}
	}
	if (!ret) {
		spin_lock_irq(&rbd_dev->lock);
		if (rbd_dev->open_count && !force)
			ret = -EBUSY;
		else
			already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
						   &rbd_dev->flags);
		spin_unlock_irq(&rbd_dev->lock);
	}
	spin_unlock(&rbd_dev_list_lock);
	if (ret < 0 || already)
		return ret;

	if (force) {
		/*
		 * Prevent new IO from being queued and wait for existing
		 * IO to complete/fail.
		 */
		blk_mq_freeze_queue(rbd_dev->disk->queue);
		blk_set_queue_dying(rbd_dev->disk->queue);
	}

	del_gendisk(rbd_dev->disk);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);
	spin_unlock(&rbd_dev_list_lock);
	device_del(&rbd_dev->dev);

	rbd_dev_image_unlock(rbd_dev);
	rbd_dev_device_release(rbd_dev);
	rbd_dev_image_release(rbd_dev);
	rbd_dev_destroy(rbd_dev);
	return count;
}

static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	if (single_major)
		return -EINVAL;

	return do_rbd_remove(bus, buf, count);
}

static ssize_t rbd_remove_single_major(struct bus_type *bus,
				       const char *buf,
				       size_t count)
{
	return do_rbd_remove(bus, buf, count);
}

/*
 * create control files in sysfs
 * /sys/bus/rbd/...
 */
static int rbd_sysfs_init(void)
{
	int ret;

	ret = device_register(&rbd_root_dev);
	if (ret < 0)
		return ret;

	ret = bus_register(&rbd_bus_type);
	if (ret < 0)
		device_unregister(&rbd_root_dev);

	return ret;
}

static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}

static int rbd_slab_init(void)
{
	rbd_assert(!rbd_img_request_cache);
	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
	if (!rbd_img_request_cache)
		return -ENOMEM;

	rbd_assert(!rbd_obj_request_cache);
	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
	if (!rbd_obj_request_cache)
		goto out_err;

	return 0;

out_err:
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
	return -ENOMEM;
}

static void rbd_slab_exit(void)
{
	rbd_assert(rbd_obj_request_cache);
	kmem_cache_destroy(rbd_obj_request_cache);
	rbd_obj_request_cache = NULL;

	rbd_assert(rbd_img_request_cache);
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
}

static int __init rbd_init(void)
{
	int rc;

	if (!libceph_compatible(NULL)) {
		rbd_warn(NULL, "libceph incompatibility (quitting)");
		return -EINVAL;
	}

	rc = rbd_slab_init();
	if (rc)
		return rc;

	/*
	 * The number of active work items is limited by the number of
	 * rbd devices * queue depth, so leave @max_active at default.
	 */
	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
	if (!rbd_wq) {
		rc = -ENOMEM;
		goto err_out_slab;
	}

	if (single_major) {
		rbd_major = register_blkdev(0, RBD_DRV_NAME);
		if (rbd_major < 0) {
			rc = rbd_major;
			goto err_out_wq;
		}
	}

	rc = rbd_sysfs_init();
	if (rc)
		goto err_out_blkdev;

	if (single_major)
		pr_info("loaded (major %d)\n", rbd_major);
	else
		pr_info("loaded\n");

	return 0;

err_out_blkdev:
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
err_out_wq:
	destroy_workqueue(rbd_wq);
err_out_slab:
	rbd_slab_exit();
	return rc;
}

static void __exit rbd_exit(void)
{
	ida_destroy(&rbd_dev_id_ida);
	rbd_sysfs_cleanup();
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
	destroy_workqueue(rbd_wq);
	rbd_slab_exit();
}

module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
MODULE_LICENSE("GPL");