
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		 Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/striper.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/workqueue.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/*
 * Increment the given counter and return its updated value.
 * If the counter is already 0 it will not be incremented.
 * If the counter is already at its maximum value returns
 * -EINVAL without updating it.
 */
static int atomic_inc_return_safe(atomic_t *v)
{
	unsigned int counter;

	counter = (unsigned int)__atomic_add_unless(v, 1, 0);
	if (counter <= (unsigned int)INT_MAX)
		return (int)counter;

	atomic_dec(v);

	return -EINVAL;
}

/* Decrement the counter.  Return the resulting value, or -EINVAL */
static int atomic_dec_return_safe(atomic_t *v)
{
	int counter;

	counter = atomic_dec_return(v);
	if (counter >= 0)
		return counter;

	atomic_inc(v);

	return -EINVAL;
}

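/*
 * Illustrative usage sketch (not part of the driver): together these
 * helpers implement a saturating counter that pins at 0.  A caller
 * wanting a reference only while the counter is "live" would do:
 *
 *	if (atomic_inc_return_safe(&cnt) <= 0)
 *		return NULL;	(cnt was pinned at 0, or overflowed)
 *	...use the referenced object...
 *	atomic_dec_return_safe(&cnt);
 *
 * rbd uses this pattern for parent_ref below, where 0 means the
 * parent image reference is no longer available.
 */
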
#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX	/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING		(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
#define RBD_FEATURE_DATA_POOL		(1ULL<<7)
#define RBD_FEATURE_OPERATIONS		(1ULL<<8)

#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These six fields never change for a given rbd image */
	char *object_prefix;
	__u8 obj_order;
	u64 stripe_unit;
	u64 stripe_count;
	s64 data_pool_id;
	u64 features;		/* Might be changeable someday? */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* format 1 only */
	u64 *snap_sizes;	/* format 1 only */
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	const char	*pool_name;

	const char	*image_id;
	const char	*image_name;

	u64		snap_id;
	const char	*snap_name;

	struct kref	kref;
};

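/*
 * Example (illustrative only): mapping "mypool/myimg@snap1" yields a
 * spec with pool_name "mypool", image_name "myimg" and snap_name
 * "snap1", with pool_id, image_id and snap_id then resolved from
 * those names.  A mapping of the image head instead carries
 * snap_id == CEPH_NOSNAP and the snap name RBD_SNAP_HEAD_NAME ("-").
 */
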
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};

struct rbd_img_request;

enum obj_request_type {
	OBJ_REQUEST_NODATA = 1,
	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
	OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
};

enum obj_operation_type {
	OBJ_OP_READ = 1,
	OBJ_OP_WRITE,
	OBJ_OP_DISCARD,
};

/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 *                       need copyup
 * RBD_OBJ_WRITE_GUARD ---------------> RBD_OBJ_WRITE_COPYUP
 *        |     ^                              |
 *        v     \------------------------------/
 *      done
 *        ^
 *        |
 * RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * there is a parent or not.
 */
enum rbd_obj_write_state {
	RBD_OBJ_WRITE_FLAT = 1,
	RBD_OBJ_WRITE_GUARD,
	RBD_OBJ_WRITE_COPYUP,
};

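/*
 * Illustrative pseudo-code (a sketch, not the driver's actual
 * completion handler) for how the states above are typically
 * traversed as OSD requests complete:
 *
 *	switch (obj_req->write_state) {
 *	case RBD_OBJ_WRITE_GUARD:
 *		if (result == -ENOENT) {
 *			(object doesn't exist: read the range from the
 *			 parent and resubmit as a copyup + write)
 *			obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
 *			break;
 *		}
 *		(otherwise fall through: the guarded write succeeded)
 *	case RBD_OBJ_WRITE_FLAT:
 *	case RBD_OBJ_WRITE_COPYUP:
 *		(done)
 *	}
 */
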
struct rbd_obj_request {
	struct ceph_object_extent ex;
	union {
		bool			tried_parent;	/* for reads */
		enum rbd_obj_write_state write_state;	/* for writes */
	};

	struct rbd_img_request	*img_request;
	struct ceph_file_extent	*img_extents;
	u32			num_img_extents;

	union {
		struct ceph_bio_iter	bio_pos;
		struct {
			struct ceph_bvec_iter	bvec_pos;
			u32			bvec_count;
			u32			bvec_idx;
		};
	};
	struct bio_vec		*copyup_bvecs;
	u32			copyup_bvec_count;

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	int			result;

	struct kref		kref;
};

enum img_req_flags {
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};

struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	enum obj_operation_type	op_type;
	enum obj_request_type	data_type;
	unsigned long		flags;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};
	spinlock_t		completion_lock;
	u64			xferred;/* aggregate bytes transferred */
	int			result;	/* first nonzero obj_request result */

	struct list_head	object_extents;	/* obj_req.ex structs */
	u32			obj_request_count;
	u32			pending_count;

	struct kref		kref;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)

enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};

enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};

/* WatchNotify::ClientId */
struct rbd_client_id {
	u64 gid;
	u64 handle;
};

struct rbd_mapping {
	u64                     size;
	u64                     features;
};

/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	int			minor;
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
	char			*config_info;	/* add{,_single_major} string */

	struct ceph_object_id	header_oid;
	struct ceph_object_locator header_oloc;

	struct ceph_file_layout	layout;		/* used for all rbd requests */

	struct mutex		watch_mutex;
	enum rbd_watch_state	watch_state;
	struct ceph_osd_linger_request *watch_handle;
	u64			watch_cookie;
	struct delayed_work	watch_dwork;

	struct rw_semaphore	lock_rwsem;
	enum rbd_lock_state	lock_state;
	char			lock_cookie[32];
	struct rbd_client_id	owner_cid;
	struct work_struct	acquired_lock_work;
	struct work_struct	released_lock_work;
	struct delayed_work	lock_dwork;
	struct work_struct	unlock_work;
	wait_queue_head_t	lock_waitq;

	struct workqueue_struct	*task_wq;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	atomic_t		parent_ref;
	struct rbd_device	*parent;

	/* Block layer tags. */
	struct blk_mq_tag_set	tag_set;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags:
 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 *   by rbd_dev->lock
 * - BLACKLISTED is protected by rbd_dev->lock_rwsem
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
	RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
};

static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

static struct workqueue_struct *rbd_wq;

/*
 * single-major requires >= 0.75 version of userspace rbd utility.
 */
static bool single_major = true;
module_param(single_major, bool, S_IRUGO);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
				    size_t count);
static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
				       size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
static void rbd_spec_put(struct rbd_spec *spec);

static int rbd_dev_id_to_minor(int dev_id)
{
	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
}

static int minor_to_rbd_dev_id(int minor)
{
	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}

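/*
 * Worked example: with RBD_SINGLE_MAJOR_PART_SHIFT of 4, each device
 * owns 2^4 = 16 minors (the whole disk plus up to 15 partitions) in
 * single-major mode.  Device id 3 maps to minor 3 << 4 = 48, and any
 * minor in [48, 63] maps back to device id 3 (48 >> 4).
 */
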
static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
}

static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	bool is_lock_owner;

	down_read(&rbd_dev->lock_rwsem);
	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
	return is_lock_owner;
}

static ssize_t rbd_supported_features_show(struct bus_type *bus, char *buf)
{
	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}

static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
static BUS_ATTR(supported_features, S_IRUGO, rbd_supported_features_show, NULL);

static struct attribute *rbd_bus_attrs[] = {
	&bus_attr_add.attr,
	&bus_attr_remove.attr,
	&bus_attr_add_single_major.attr,
	&bus_attr_remove_single_major.attr,
	&bus_attr_supported_features.attr,
	NULL,
};

static umode_t rbd_bus_is_visible(struct kobject *kobj,
				  struct attribute *attr, int index)
{
	if (!single_major &&
	    (attr == &bus_attr_add_single_major.attr ||
	     attr == &bus_attr_remove_single_major.attr))
		return 0;

	return attr->mode;
}

static const struct attribute_group rbd_bus_group = {
	.attrs = rbd_bus_attrs,
	.is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_groups	= rbd_bus_groups,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}

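/*
 * Illustrative usage (hypothetical call site): rbd_warn() takes
 * printf-style arguments and prefixes the message with the most
 * specific identity known for the device, e.g.
 *
 *	rbd_warn(rbd_dev, "%s failed: %d", "resize", ret);
 *
 * could be logged as "rbd: rbd0: resize failed: -12" once the disk
 * has been set up, or "rbd: image myimg: ..." before that.
 */
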
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_header_info(struct rbd_device *rbd_dev);
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
		u8 *order, u64 *snap_size);
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features);

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	(void) get_device(&rbd_dev->dev);

	return 0;
}

static void rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	put_device(&rbd_dev->dev);
}

static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
{
	int ro;

	if (get_user(ro, (int __user *)arg))
		return -EFAULT;

	/* Snapshots can't be marked read-write */
	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
		return -EROFS;

	/* Let blkdev_roset() handle it */
	return -ENOTTY;
}

static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	int ret;

	switch (cmd) {
	case BLKROSET:
		ret = rbd_ioctl_set_ro(rbd_dev, arg);
		break;
	default:
		ret = -ENOTTY;
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	return rbd_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
	.ioctl			= rbd_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl		= rbd_compat_ioctl,
#endif
};

/*
 * Initialize an rbd client instance.  Success or not, this function
 * consumes ceph_opts.  Caller holds client_mutex.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	rbdc->client = ceph_create_client(ceph_opts, rbdc);
	if (IS_ERR(rbdc->client))
		goto out_rbdc;
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_client;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;
out_client:
	ceph_destroy_client(rbdc->client);
out_rbdc:
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}

/*
 * (Per device) rbd map options
 */
enum {
	Opt_queue_depth,
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_err
};

static match_table_t rbd_opts_tokens = {
	{Opt_queue_depth, "queue_depth=%d"},
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	{Opt_lock_on_read, "lock_on_read"},
	{Opt_exclusive, "exclusive"},
	{Opt_err, NULL}
};

struct rbd_options {
	int	queue_depth;
	bool	read_only;
	bool	lock_on_read;
	bool	exclusive;
};

#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
#define RBD_READ_ONLY_DEFAULT	false
#define RBD_LOCK_ON_READ_DEFAULT false
#define RBD_EXCLUSIVE_DEFAULT	false

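/*
 * Example (illustrative): these options arrive in the string written
 * to /sys/bus/rbd/add (usually by the rbd CLI), so a map like
 *
 *	rbd map mypool/myimg -o queue_depth=128,lock_on_read
 *
 * would leave rbd_opts->queue_depth == 128 and
 * rbd_opts->lock_on_read == true, with the remaining fields at their
 * RBD_*_DEFAULT values.
 */
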
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token, argstr[0].from);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_queue_depth:
		if (intval < 1) {
			pr_err("queue_depth out of range\n");
			return -EINVAL;
		}
		rbd_opts->queue_depth = intval;
		break;
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	case Opt_lock_on_read:
		rbd_opts->lock_on_read = true;
		break;
	case Opt_exclusive:
		rbd_opts->exclusive = true;
		break;
	default:
		/* libceph prints "bad option" msg */
		return -EINVAL;
	}

	return 0;
}

static char* obj_op_name(enum obj_operation_type op_type)
{
	switch (op_type) {
	case OBJ_OP_READ:
		return "read";
	case OBJ_OP_WRITE:
		return "write";
	case OBJ_OP_DISCARD:
		return "discard";
	default:
		return "???";
	}
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, ceph_opts is consumed by this
 * function.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;

	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
	rbdc = rbd_client_find(ceph_opts);
	if (rbdc)	/* using an existing client */
		ceph_destroy_options(ceph_opts);
	else
		rbdc = rbd_client_create(ceph_opts);
	mutex_unlock(&client_mutex);

	return rbdc;
}

/*
 * Destroy ceph client
 *
 * Takes rbd_client_list_lock to unlink the client from the list;
 * the caller must not already hold it.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}

/*
 * returns the size of an object in the image
 */
static u32 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1U << header->obj_order;
}

static void rbd_init_layout(struct rbd_device *rbd_dev)
{
	if (rbd_dev->header.stripe_unit == 0 ||
	    rbd_dev->header.stripe_count == 0) {
		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
		rbd_dev->header.stripe_count = 1;
	}

	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
}

/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
				 struct rbd_image_header_ondisk *ondisk)
{
	struct rbd_image_header *header = &rbd_dev->header;
	bool first_time = header->object_prefix == NULL;
	struct ceph_snap_context *snapc;
	char *object_prefix = NULL;
	char *snap_names = NULL;
	u64 *snap_sizes = NULL;
	u32 snap_count;
	int ret = -ENOMEM;
	u32 i;

	/* Allocate this now to avoid having to handle failure below */

	if (first_time) {
		object_prefix = kstrndup(ondisk->object_prefix,
					 sizeof(ondisk->object_prefix),
					 GFP_KERNEL);
		if (!object_prefix)
			return -ENOMEM;
	}

	/* Allocate the snapshot context and fill it in */

	snap_count = le32_to_cpu(ondisk->snap_count);
	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc)
		goto out_err;
	snapc->seq = le64_to_cpu(ondisk->snap_seq);
	if (snap_count) {
		struct rbd_image_snap_ondisk *snaps;
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* We'll keep a copy of the snapshot names... */

		if (snap_names_len > (u64)SIZE_MAX)
			goto out_2big;
		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!snap_names)
			goto out_err;

		/* ...as well as the array of their sizes. */
		snap_sizes = kmalloc_array(snap_count,
					   sizeof(*header->snap_sizes),
					   GFP_KERNEL);
		if (!snap_sizes)
			goto out_err;

		/*
		 * Copy the names, and fill in each snapshot's id
		 * and size.
		 *
		 * Note that rbd_dev_v1_header_info() guarantees the
		 * ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, this memcpy() is safe.
		 */
		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
		snaps = ondisk->snaps;
		for (i = 0; i < snap_count; i++) {
			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
		}
	}

	/* We won't fail any more, fill in the header */

	if (first_time) {
		header->object_prefix = object_prefix;
		header->obj_order = ondisk->options.order;
		rbd_init_layout(rbd_dev);
	} else {
		ceph_put_snap_context(header->snapc);
		kfree(header->snap_names);
		kfree(header->snap_sizes);
	}

	/* The remaining fields always get updated (when we refresh) */

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->snapc = snapc;
	header->snap_names = snap_names;
	header->snap_sizes = snap_sizes;

	return 0;
out_2big:
	ret = -EIO;
out_err:
	kfree(snap_sizes);
	kfree(snap_names);
	ceph_put_snap_context(snapc);
	kfree(object_prefix);

	return ret;
}

static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	const char *snap_name;

	rbd_assert(which < rbd_dev->header.snapc->num_snaps);

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which--)
		snap_name += strlen(snap_name) + 1;

	return kstrdup(snap_name, GFP_KERNEL);
}

/*
 * Snapshot id comparison function for use with qsort()/bsearch().
 * Note that result is for snapshots in *descending* order.
 */
static int snapid_compare_reverse(const void *s1, const void *s2)
{
	u64 snap_id1 = *(u64 *)s1;
	u64 snap_id2 = *(u64 *)s2;

	if (snap_id1 < snap_id2)
		return 1;
	return snap_id1 == snap_id2 ? 0 : -1;
}

/*
 * Search a snapshot context to see if the given snapshot id is
 * present.
 *
 * Returns the position of the snapshot id in the array if it's found,
 * or BAD_SNAP_INDEX otherwise.
 *
 * Note: The snapshot array is kept sorted (by the osd) in
 * reverse order, highest snapshot id first.
 */
static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	u64 *found;

	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
				sizeof (snap_id), snapid_compare_reverse);

	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
}

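/*
 * Worked example: for a snapshot context whose snaps array is
 * { 12, 7, 3 } (newest id first), rbd_dev_snap_index() returns 1 for
 * snap_id == 7 and BAD_SNAP_INDEX for snap_id == 5.
 * snapid_compare_reverse() inverts the usual ordering so that
 * bsearch() works on this descending array.
 */
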
Alex Elder2ad3d712013-04-30 00:44:33 -05001110static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
1111 u64 snap_id)
Alex Elder54cac612013-04-30 00:44:33 -05001112{
1113 u32 which;
Josh Durginda6a6b62013-09-04 17:57:31 -07001114 const char *snap_name;
Alex Elder54cac612013-04-30 00:44:33 -05001115
1116 which = rbd_dev_snap_index(rbd_dev, snap_id);
1117 if (which == BAD_SNAP_INDEX)
Josh Durginda6a6b62013-09-04 17:57:31 -07001118 return ERR_PTR(-ENOENT);
Alex Elder54cac612013-04-30 00:44:33 -05001119
Josh Durginda6a6b62013-09-04 17:57:31 -07001120 snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1121 return snap_name ? snap_name : ERR_PTR(-ENOMEM);
Alex Elder54cac612013-04-30 00:44:33 -05001122}
1123
Alex Elder9e15b772012-10-30 19:40:33 -05001124static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
1125{
Alex Elder9e15b772012-10-30 19:40:33 -05001126 if (snap_id == CEPH_NOSNAP)
1127 return RBD_SNAP_HEAD_NAME;
1128
Alex Elder54cac612013-04-30 00:44:33 -05001129 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1130 if (rbd_dev->image_format == 1)
1131 return rbd_dev_v1_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -05001132
Alex Elder54cac612013-04-30 00:44:33 -05001133 return rbd_dev_v2_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -05001134}
1135
Alex Elder2ad3d712013-04-30 00:44:33 -05001136static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
1137 u64 *snap_size)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001138{
Alex Elder2ad3d712013-04-30 00:44:33 -05001139 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1140 if (snap_id == CEPH_NOSNAP) {
		*snap_size = rbd_dev->header.image_size;
	} else if (rbd_dev->image_format == 1) {
		u32 which;

		which = rbd_dev_snap_index(rbd_dev, snap_id);
		if (which == BAD_SNAP_INDEX)
			return -ENOENT;

		*snap_size = rbd_dev->header.snap_sizes[which];
	} else {
		u64 size = 0;
		int ret;

		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
		if (ret)
			return ret;

		*snap_size = size;
	}
	return 0;
}

static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
			u64 *snap_features)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_features = rbd_dev->header.features;
	} else if (rbd_dev->image_format == 1) {
		*snap_features = 0;	/* No features for format 1 */
	} else {
		u64 features = 0;
		int ret;

		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
		if (ret)
			return ret;

		*snap_features = features;
	}
	return 0;
}

static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
{
	u64 snap_id = rbd_dev->spec->snap_id;
	u64 size = 0;
	u64 features = 0;
	int ret;

	ret = rbd_snap_size(rbd_dev, snap_id, &size);
	if (ret)
		return ret;
	ret = rbd_snap_features(rbd_dev, snap_id, &features);
	if (ret)
		return ret;

	rbd_dev->mapping.size = size;
	rbd_dev->mapping.features = features;

	return 0;
}

static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
{
	rbd_dev->mapping.size = 0;
	rbd_dev->mapping.features = 0;
}

static void zero_bvec(struct bio_vec *bv)
{
	void *buf;
	unsigned long flags;

	buf = bvec_kmap_irq(bv, &flags);
	memset(buf, 0, bv->bv_len);
	flush_dcache_page(bv->bv_page);
	bvec_kunmap_irq(buf, &flags);
}

static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
{
	struct ceph_bio_iter it = *bio_pos;

	ceph_bio_iter_advance(&it, off);
	ceph_bio_iter_advance_step(&it, bytes, ({
		zero_bvec(&bv);
	}));
}

static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
{
	struct ceph_bvec_iter it = *bvec_pos;

	ceph_bvec_iter_advance(&it, off);
	ceph_bvec_iter_advance_step(&it, bytes, ({
		zero_bvec(&bv);
	}));
}

/*
 * Zero a range in @obj_req data buffer defined by a bio (list) or
 * (private) bio_vec array.
 *
 * @off is relative to the start of the data buffer.
 */
static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
			       u32 bytes)
{
	switch (obj_req->img_request->data_type) {
	case OBJ_REQUEST_BIO:
		zero_bios(&obj_req->bio_pos, off, bytes);
		break;
	case OBJ_REQUEST_BVECS:
	case OBJ_REQUEST_OWN_BVECS:
		zero_bvecs(&obj_req->bvec_pos, off, bytes);
		break;
	default:
		rbd_assert(0);
	}
}
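
/*
 * Illustrative note (not driver code): a short read that transferred
 * only 4K of a 16K request would have its tail cleared with something
 * like
 *
 *	rbd_obj_zero_range(obj_req, 4096, 16384 - 4096);
 *
 * which dispatches to zero_bios() or zero_bvecs() above depending on
 * how the data buffer was described.  This is exactly what
 * rbd_obj_handle_read() below does for holes and short reads.
 */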

static void rbd_obj_request_destroy(struct kref *kref);
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
	     kref_read(&obj_request->kref));
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}

static void rbd_img_request_get(struct rbd_img_request *img_request)
{
	dout("%s: img %p (was %d)\n", __func__, img_request,
	     kref_read(&img_request->kref));
	kref_get(&img_request->kref);
}

static void rbd_img_request_destroy(struct kref *kref);
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	dout("%s: img %p (was %d)\n", __func__, img_request,
	     kref_read(&img_request->kref));
	kref_put(&img_request->kref, rbd_img_request_destroy);
}

static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->img_request == NULL);

	/* Image request now owns object's original reference */
	obj_request->img_request = img_request;
	img_request->obj_request_count++;
	img_request->pending_count++;
	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
}

static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
	list_del(&obj_request->ex.oe_item);
	rbd_assert(img_request->obj_request_count > 0);
	img_request->obj_request_count--;
	rbd_assert(obj_request->img_request == img_request);
	rbd_obj_request_put(obj_request);
}

static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
{
	struct ceph_osd_request *osd_req = obj_request->osd_req;

	dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
	     obj_request, obj_request->ex.oe_objno, obj_request->ex.oe_off,
	     obj_request->ex.oe_len, osd_req);
	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
}

/*
 * The default/initial value for all image request flags is 0.  Each
 * is conditionally set to 1 at image request initialization time
 * and currently never changes thereafter.
 */
static void img_request_layered_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_LAYERED, &img_request->flags);
	smp_mb();
}

static void img_request_layered_clear(struct rbd_img_request *img_request)
{
	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
	smp_mb();
}

static bool img_request_layered_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
}

static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;

	return !obj_req->ex.oe_off &&
	       obj_req->ex.oe_len == rbd_dev->layout.object_size;
}

static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;

	return obj_req->ex.oe_off + obj_req->ex.oe_len ==
					rbd_dev->layout.object_size;
}
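
/*
 * Worked example for the two predicates above, assuming the default
 * 4M object size: extent 0~4194304 is "entire"; extent 1048576~3145728
 * ends exactly on the object boundary and is a "tail"; extent
 * 0~1048576 is neither.
 */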

static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
{
	return ceph_file_extents_bytes(obj_req->img_extents,
				       obj_req->num_img_extents);
}

static bool rbd_img_is_write(struct rbd_img_request *img_req)
{
	switch (img_req->op_type) {
	case OBJ_OP_READ:
		return false;
	case OBJ_OP_WRITE:
	case OBJ_OP_DISCARD:
		return true;
	default:
		rbd_assert(0);
	}
}

static void rbd_obj_handle_request(struct rbd_obj_request *obj_req);

static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;

	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
	     osd_req->r_result, obj_req);
	rbd_assert(osd_req == obj_req->osd_req);

	obj_req->result = osd_req->r_result < 0 ? osd_req->r_result : 0;
	if (!obj_req->result && !rbd_img_is_write(obj_req->img_request))
		obj_req->xferred = osd_req->r_result;
	else
		/*
		 * Writes aren't allowed to return a data payload.  In some
		 * guarded write cases (e.g. stat + zero on an empty object)
		 * a stat response makes it through, but we don't care.
		 */
		obj_req->xferred = 0;

	rbd_obj_handle_request(obj_req);
}

static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
{
	struct ceph_osd_request *osd_req = obj_request->osd_req;

	osd_req->r_flags = CEPH_OSD_FLAG_READ;
	osd_req->r_snapid = obj_request->img_request->snap_id;
}

static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
{
	struct ceph_osd_request *osd_req = obj_request->osd_req;

	osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
	ktime_get_real_ts(&osd_req->r_mtime);
	osd_req->r_data_offset = obj_request->ex.oe_off;
}

static struct ceph_osd_request *
rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops)
{
	struct rbd_img_request *img_req = obj_req->img_request;
	struct rbd_device *rbd_dev = img_req->rbd_dev;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_request *req;
	const char *name_format = rbd_dev->image_format == 1 ?
				      RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;

	req = ceph_osdc_alloc_request(osdc,
			(rbd_img_is_write(img_req) ? img_req->snapc : NULL),
			num_ops, false, GFP_NOIO);
	if (!req)
		return NULL;

	req->r_callback = rbd_osd_req_callback;
	req->r_priv = obj_req;

	req->r_base_oloc.pool = rbd_dev->layout.pool_id;
	if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
			rbd_dev->header.object_prefix, obj_req->ex.oe_objno))
		goto err_req;

	if (ceph_osdc_alloc_messages(req, GFP_NOIO))
		goto err_req;

	return req;

err_req:
	ceph_osdc_put_request(req);
	return NULL;
}
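
/*
 * Example of the resulting object name (a sketch; the actual format
 * strings live in rbd_types.h): assuming RBD_V2_DATA_FORMAT is
 * "%s.%016llx", a hypothetical format 2 image with object_prefix
 * "rbd_data.101741e4aa91" and oe_objno 5 would map to the OID
 * "rbd_data.101741e4aa91.0000000000000005".
 */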

static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}

static struct rbd_obj_request *rbd_obj_request_create(void)
{
	struct rbd_obj_request *obj_request;

	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
	if (!obj_request)
		return NULL;

	ceph_object_extent_init(&obj_request->ex);
	kref_init(&obj_request->kref);

	dout("%s %p\n", __func__, obj_request);
	return obj_request;
}

static void rbd_obj_request_destroy(struct kref *kref)
{
	struct rbd_obj_request *obj_request;
	u32 i;

	obj_request = container_of(kref, struct rbd_obj_request, kref);

	dout("%s: obj %p\n", __func__, obj_request);

	if (obj_request->osd_req)
		rbd_osd_req_destroy(obj_request->osd_req);

	switch (obj_request->img_request->data_type) {
	case OBJ_REQUEST_NODATA:
	case OBJ_REQUEST_BIO:
	case OBJ_REQUEST_BVECS:
		break;		/* Nothing to do */
	case OBJ_REQUEST_OWN_BVECS:
		kfree(obj_request->bvec_pos.bvecs);
		break;
	default:
		rbd_assert(0);
	}

	kfree(obj_request->img_extents);
	if (obj_request->copyup_bvecs) {
		for (i = 0; i < obj_request->copyup_bvec_count; i++) {
			if (obj_request->copyup_bvecs[i].bv_page)
				__free_page(obj_request->copyup_bvecs[i].bv_page);
		}
		kfree(obj_request->copyup_bvecs);
	}

	kmem_cache_free(rbd_obj_request_cache, obj_request);
}

/* It's OK to call this for a device with no parent */

static void rbd_spec_put(struct rbd_spec *spec);
static void rbd_dev_unparent(struct rbd_device *rbd_dev)
{
	rbd_dev_remove_parent(rbd_dev);
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;
}

/*
 * Parent image reference counting is used to determine when an
 * image's parent fields can be safely torn down--after there are no
 * more in-flight requests to the parent image.  When the last
 * reference is dropped, cleaning them up is safe.
 */
static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
{
	int counter;

	if (!rbd_dev->parent_spec)
		return;

	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
	if (counter > 0)
		return;

	/* Last reference; clean up parent data structures */

	if (!counter)
		rbd_dev_unparent(rbd_dev);
	else
		rbd_warn(rbd_dev, "parent reference underflow");
}

/*
 * If an image has a non-zero parent overlap, get a reference to its
 * parent.
 *
 * Returns true if the rbd device has a parent with a non-zero
 * overlap and a reference for it was successfully taken, or
 * false otherwise.
 */
static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
{
	int counter = 0;

	if (!rbd_dev->parent_spec)
		return false;

	down_read(&rbd_dev->header_rwsem);
	if (rbd_dev->parent_overlap)
		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
	up_read(&rbd_dev->header_rwsem);

	if (counter < 0)
		rbd_warn(rbd_dev, "parent reference overflow");

	return counter > 0;
}
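
/*
 * Usage note: rbd_img_request_create() below takes this parent
 * reference for layered images (setting IMG_REQ_LAYERED) and
 * rbd_img_request_destroy() drops it again, so the parent can only be
 * torn down once all such image requests have completed.
 */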

/*
 * Caller is responsible for filling in the list of object requests
 * that comprises the image request, and the Linux request pointer
 * (if there is one).
 */
static struct rbd_img_request *rbd_img_request_create(
					struct rbd_device *rbd_dev,
					enum obj_operation_type op_type,
					struct ceph_snap_context *snapc)
{
	struct rbd_img_request *img_request;

	img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
	if (!img_request)
		return NULL;

	img_request->rbd_dev = rbd_dev;
	img_request->op_type = op_type;
	if (!rbd_img_is_write(img_request))
		img_request->snap_id = rbd_dev->spec->snap_id;
	else
		img_request->snapc = snapc;

	if (rbd_dev_parent_get(rbd_dev))
		img_request_layered_set(img_request);

	spin_lock_init(&img_request->completion_lock);
	INIT_LIST_HEAD(&img_request->object_extents);
	kref_init(&img_request->kref);

	dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev,
	     obj_op_name(op_type), img_request);
	return img_request;
}

static void rbd_img_request_destroy(struct kref *kref)
{
	struct rbd_img_request *img_request;
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	img_request = container_of(kref, struct rbd_img_request, kref);

	dout("%s: img %p\n", __func__, img_request);

	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_img_obj_request_del(img_request, obj_request);
	rbd_assert(img_request->obj_request_count == 0);

	if (img_request_layered_test(img_request)) {
		img_request_layered_clear(img_request);
		rbd_dev_parent_put(img_request->rbd_dev);
	}

	if (rbd_img_is_write(img_request))
		ceph_put_snap_context(img_request->snapc);

	kmem_cache_free(rbd_img_request_cache, img_request);
}

static void prune_extents(struct ceph_file_extent *img_extents,
			  u32 *num_img_extents, u64 overlap)
{
	u32 cnt = *num_img_extents;

	/* drop extents completely beyond the overlap */
	while (cnt && img_extents[cnt - 1].fe_off >= overlap)
		cnt--;

	if (cnt) {
		struct ceph_file_extent *ex = &img_extents[cnt - 1];

		/* trim final overlapping extent */
		if (ex->fe_off + ex->fe_len > overlap)
			ex->fe_len = overlap - ex->fe_off;
	}

	*num_img_extents = cnt;
}
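
/*
 * Worked example: with overlap == 8M, the (sorted) extents
 * { 0M~4M, 6M~4M, 9M~1M } are pruned to { 0M~4M, 6M~2M } -- the extent
 * starting beyond the overlap is dropped and the final straddling
 * extent is trimmed to end at the overlap point.
 */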

/*
 * Determine the byte range(s) covered by either just the object extent
 * or the entire object in the parent image.
 */
static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
				    bool entire)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	int ret;

	if (!rbd_dev->parent_overlap)
		return 0;

	ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
				  entire ? 0 : obj_req->ex.oe_off,
				  entire ? rbd_dev->layout.object_size :
							obj_req->ex.oe_len,
				  &obj_req->img_extents,
				  &obj_req->num_img_extents);
	if (ret)
		return ret;

	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
		      rbd_dev->parent_overlap);
	return 0;
}
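
/*
 * Worked example (simple layout, 4M objects): for oe_objno == 5 with
 * entire == true, the object reverse-maps to image extent 20M~4M.  If
 * the parent overlap is 22M, prune_extents() trims that to 20M~2M; if
 * the overlap is 20M or less, no image extents remain at all.
 */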

static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
{
	switch (obj_req->img_request->data_type) {
	case OBJ_REQUEST_BIO:
		osd_req_op_extent_osd_data_bio(obj_req->osd_req, which,
					       &obj_req->bio_pos,
					       obj_req->ex.oe_len);
		break;
	case OBJ_REQUEST_BVECS:
	case OBJ_REQUEST_OWN_BVECS:
		rbd_assert(obj_req->bvec_pos.iter.bi_size ==
							obj_req->ex.oe_len);
		rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
		osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which,
						    &obj_req->bvec_pos);
		break;
	default:
		rbd_assert(0);
	}
}

static int rbd_obj_setup_read(struct rbd_obj_request *obj_req)
{
	obj_req->osd_req = rbd_osd_req_create(obj_req, 1);
	if (!obj_req->osd_req)
		return -ENOMEM;

	osd_req_op_extent_init(obj_req->osd_req, 0, CEPH_OSD_OP_READ,
			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
	rbd_osd_req_setup_data(obj_req, 0);

	rbd_osd_req_format_read(obj_req);
	return 0;
}

static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req,
				unsigned int which)
{
	struct page **pages;

	/*
	 * The response data for a STAT call consists of:
	 *     le64 length;
	 *     struct {
	 *         le32 tv_sec;
	 *         le32 tv_nsec;
	 *     } mtime;
	 */
	pages = ceph_alloc_page_vector(1, GFP_NOIO);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	osd_req_op_init(obj_req->osd_req, which, CEPH_OSD_OP_STAT, 0);
	osd_req_op_raw_data_in_pages(obj_req->osd_req, which, pages,
				     8 + sizeof(struct ceph_timespec),
				     0, false, true);
	return 0;
}
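
/*
 * Sketch (not used by the driver) of how the STAT reply described
 * above could be decoded from the single response page:
 *
 *	void *p = page_address(pages[0]);
 *	u64 size = ceph_decode_64(&p);
 *	struct ceph_timespec mtime;
 *
 *	ceph_decode_copy(&p, &mtime, sizeof(mtime));
 *
 * In practice only the op's return code matters here: a guarded write
 * keys off -ENOENT from the stat, not the returned size or mtime.
 */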

static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req,
				  unsigned int which)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	u16 opcode;

	osd_req_op_alloc_hint_init(obj_req->osd_req, which++,
				   rbd_dev->layout.object_size,
				   rbd_dev->layout.object_size);

	if (rbd_obj_is_entire(obj_req))
		opcode = CEPH_OSD_OP_WRITEFULL;
	else
		opcode = CEPH_OSD_OP_WRITE;

	osd_req_op_extent_init(obj_req->osd_req, which, opcode,
			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
	rbd_osd_req_setup_data(obj_req, which++);

	rbd_assert(which == obj_req->osd_req->r_num_ops);
	rbd_osd_req_format_write(obj_req);
}

static int rbd_obj_setup_write(struct rbd_obj_request *obj_req)
{
	unsigned int num_osd_ops, which = 0;
	int ret;

	/* reverse map the entire object onto the parent */
	ret = rbd_obj_calc_img_extents(obj_req, true);
	if (ret)
		return ret;

	if (obj_req->num_img_extents) {
		obj_req->write_state = RBD_OBJ_WRITE_GUARD;
		num_osd_ops = 3; /* stat + setallochint + write/writefull */
	} else {
		obj_req->write_state = RBD_OBJ_WRITE_FLAT;
		num_osd_ops = 2; /* setallochint + write/writefull */
	}

	obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
	if (!obj_req->osd_req)
		return -ENOMEM;

	if (obj_req->num_img_extents) {
		ret = __rbd_obj_setup_stat(obj_req, which++);
		if (ret)
			return ret;
	}

	__rbd_obj_setup_write(obj_req, which);
	return 0;
}

static void __rbd_obj_setup_discard(struct rbd_obj_request *obj_req,
				    unsigned int which)
{
	u16 opcode;

	if (rbd_obj_is_entire(obj_req)) {
		if (obj_req->num_img_extents) {
			osd_req_op_init(obj_req->osd_req, which++,
					CEPH_OSD_OP_CREATE, 0);
			opcode = CEPH_OSD_OP_TRUNCATE;
		} else {
			osd_req_op_init(obj_req->osd_req, which++,
					CEPH_OSD_OP_DELETE, 0);
			opcode = 0;
		}
	} else if (rbd_obj_is_tail(obj_req)) {
		opcode = CEPH_OSD_OP_TRUNCATE;
	} else {
		opcode = CEPH_OSD_OP_ZERO;
	}

	if (opcode)
		osd_req_op_extent_init(obj_req->osd_req, which++, opcode,
				       obj_req->ex.oe_off, obj_req->ex.oe_len,
				       0, 0);

	rbd_assert(which == obj_req->osd_req->r_num_ops);
	rbd_osd_req_format_write(obj_req);
}

static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req)
{
	unsigned int num_osd_ops, which = 0;
	int ret;

	/* reverse map the entire object onto the parent */
	ret = rbd_obj_calc_img_extents(obj_req, true);
	if (ret)
		return ret;

	if (rbd_obj_is_entire(obj_req)) {
		obj_req->write_state = RBD_OBJ_WRITE_FLAT;
		if (obj_req->num_img_extents)
			num_osd_ops = 2; /* create + truncate */
		else
			num_osd_ops = 1; /* delete */
	} else {
		if (obj_req->num_img_extents) {
			obj_req->write_state = RBD_OBJ_WRITE_GUARD;
			num_osd_ops = 2; /* stat + truncate/zero */
		} else {
			obj_req->write_state = RBD_OBJ_WRITE_FLAT;
			num_osd_ops = 1; /* truncate/zero */
		}
	}

	obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
	if (!obj_req->osd_req)
		return -ENOMEM;

	if (!rbd_obj_is_entire(obj_req) && obj_req->num_img_extents) {
		ret = __rbd_obj_setup_stat(obj_req, which++);
		if (ret)
			return ret;
	}

	__rbd_obj_setup_discard(obj_req, which);
	return 0;
}

/*
 * For each object request in @img_req, allocate an OSD request, add
 * individual OSD ops and prepare them for submission.  The number of
 * OSD ops depends on op_type and the overlap point (if any).
 */
static int __rbd_img_fill_request(struct rbd_img_request *img_req)
{
	struct rbd_obj_request *obj_req;
	int ret;

	for_each_obj_request(img_req, obj_req) {
		switch (img_req->op_type) {
		case OBJ_OP_READ:
			ret = rbd_obj_setup_read(obj_req);
			break;
		case OBJ_OP_WRITE:
			ret = rbd_obj_setup_write(obj_req);
			break;
		case OBJ_OP_DISCARD:
			ret = rbd_obj_setup_discard(obj_req);
			break;
		default:
			rbd_assert(0);
		}
		if (ret)
			return ret;
	}

	return 0;
}

union rbd_img_fill_iter {
	struct ceph_bio_iter	bio_iter;
	struct ceph_bvec_iter	bvec_iter;
};

struct rbd_img_fill_ctx {
	enum obj_request_type	pos_type;
	union rbd_img_fill_iter	*pos;
	union rbd_img_fill_iter	iter;
	ceph_object_extent_fn_t	set_pos_fn;
	ceph_object_extent_fn_t	count_fn;
	ceph_object_extent_fn_t	copy_fn;
};

static struct ceph_object_extent *alloc_object_extent(void *arg)
{
	struct rbd_img_request *img_req = arg;
	struct rbd_obj_request *obj_req;

	obj_req = rbd_obj_request_create();
	if (!obj_req)
		return NULL;

	rbd_img_obj_request_add(img_req, obj_req);
	return &obj_req->ex;
}

/*
 * While su != os && sc == 1 is technically not fancy (it's the same
 * layout as su == os && sc == 1), we can't use the nocopy path for it
 * because ->set_pos_fn() should be called only once per object.
 * ceph_file_to_extents() invokes action_fn once per stripe unit, so
 * treat su != os && sc == 1 as fancy.
 */
static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
{
	return l->stripe_unit != l->object_size;
}
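
/*
 * Example: the default layout (su == os == 4M, sc == 1) is not fancy
 * and takes the nocopy path.  An image created with, say,
 * "rbd create --stripe-unit 64K --stripe-count 8 ..." has su != os and
 * goes through the count/copy passes below.
 */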

static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
				       struct ceph_file_extent *img_extents,
				       u32 num_img_extents,
				       struct rbd_img_fill_ctx *fctx)
{
	u32 i;
	int ret;

	img_req->data_type = fctx->pos_type;

	/*
	 * Create object requests and set each object request's starting
	 * position in the provided bio (list) or bio_vec array.
	 */
	fctx->iter = *fctx->pos;
	for (i = 0; i < num_img_extents; i++) {
		ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
					   img_extents[i].fe_off,
					   img_extents[i].fe_len,
					   &img_req->object_extents,
					   alloc_object_extent, img_req,
					   fctx->set_pos_fn, &fctx->iter);
		if (ret)
			return ret;
	}

	return __rbd_img_fill_request(img_req);
}

/*
 * Map a list of image extents to a list of object extents, create the
 * corresponding object requests (normally each to a different object,
 * but not always) and add them to @img_req.  For each object request,
 * set up its data descriptor to point to the corresponding chunk(s) of
 * @fctx->pos data buffer.
 *
 * Because ceph_file_to_extents() will merge adjacent object extents
 * together, each object request's data descriptor may point to multiple
 * different chunks of @fctx->pos data buffer.
 *
 * @fctx->pos data buffer is assumed to be large enough.
 */
static int rbd_img_fill_request(struct rbd_img_request *img_req,
				struct ceph_file_extent *img_extents,
				u32 num_img_extents,
				struct rbd_img_fill_ctx *fctx)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;
	struct rbd_obj_request *obj_req;
	u32 i;
	int ret;

	if (fctx->pos_type == OBJ_REQUEST_NODATA ||
	    !rbd_layout_is_fancy(&rbd_dev->layout))
		return rbd_img_fill_request_nocopy(img_req, img_extents,
						   num_img_extents, fctx);

	img_req->data_type = OBJ_REQUEST_OWN_BVECS;

	/*
	 * Create object requests and determine ->bvec_count for each object
	 * request.  Note that ->bvec_count sum over all object requests may
	 * be greater than the number of bio_vecs in the provided bio (list)
	 * or bio_vec array because when mapped, those bio_vecs can straddle
	 * stripe unit boundaries.
	 */
	fctx->iter = *fctx->pos;
	for (i = 0; i < num_img_extents; i++) {
		ret = ceph_file_to_extents(&rbd_dev->layout,
					   img_extents[i].fe_off,
					   img_extents[i].fe_len,
					   &img_req->object_extents,
					   alloc_object_extent, img_req,
					   fctx->count_fn, &fctx->iter);
		if (ret)
			return ret;
	}

	for_each_obj_request(img_req, obj_req) {
		obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
					      sizeof(*obj_req->bvec_pos.bvecs),
					      GFP_NOIO);
		if (!obj_req->bvec_pos.bvecs)
			return -ENOMEM;
	}

	/*
	 * Fill in each object request's private bio_vec array, splitting and
	 * rearranging the provided bio_vecs in stripe unit chunks as needed.
	 */
	fctx->iter = *fctx->pos;
	for (i = 0; i < num_img_extents; i++) {
		ret = ceph_iterate_extents(&rbd_dev->layout,
					   img_extents[i].fe_off,
					   img_extents[i].fe_len,
					   &img_req->object_extents,
					   fctx->copy_fn, &fctx->iter);
		if (ret)
			return ret;
	}

	return __rbd_img_fill_request(img_req);
}
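
/*
 * Worked example of why ->bvec_count can exceed the number of provided
 * bio_vecs: with a 64K stripe unit, a single 128K bio_vec covering
 * image offsets 32K..160K crosses the stripe unit boundaries at 64K
 * and 128K and is therefore split into three chunks (32K, 64K, 32K),
 * potentially belonging to different object requests.
 */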

static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
			       u64 off, u64 len)
{
	struct ceph_file_extent ex = { off, len };
	union rbd_img_fill_iter dummy;
	struct rbd_img_fill_ctx fctx = {
		.pos_type = OBJ_REQUEST_NODATA,
		.pos = &dummy,
	};

	return rbd_img_fill_request(img_req, &ex, 1, &fctx);
}

static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bio_iter *it = arg;

	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
	obj_req->bio_pos = *it;
	ceph_bio_iter_advance(it, bytes);
}

static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bio_iter *it = arg;

	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
	ceph_bio_iter_advance_step(it, bytes, ({
		obj_req->bvec_count++;
	}));
}

static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bio_iter *it = arg;

	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
	ceph_bio_iter_advance_step(it, bytes, ({
		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
	}));
}

static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
				   struct ceph_file_extent *img_extents,
				   u32 num_img_extents,
				   struct ceph_bio_iter *bio_pos)
{
	struct rbd_img_fill_ctx fctx = {
		.pos_type = OBJ_REQUEST_BIO,
		.pos = (union rbd_img_fill_iter *)bio_pos,
		.set_pos_fn = set_bio_pos,
		.count_fn = count_bio_bvecs,
		.copy_fn = copy_bio_bvecs,
	};

	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
				    &fctx);
}

static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
				 u64 off, u64 len, struct bio *bio)
{
	struct ceph_file_extent ex = { off, len };
	struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };

	return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
}

static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bvec_iter *it = arg;

	obj_req->bvec_pos = *it;
	ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
	ceph_bvec_iter_advance(it, bytes);
}

static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bvec_iter *it = arg;

	ceph_bvec_iter_advance_step(it, bytes, ({
		obj_req->bvec_count++;
	}));
}

static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bvec_iter *it = arg;

	ceph_bvec_iter_advance_step(it, bytes, ({
		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
	}));
}

static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
				     struct ceph_file_extent *img_extents,
				     u32 num_img_extents,
				     struct ceph_bvec_iter *bvec_pos)
{
	struct rbd_img_fill_ctx fctx = {
		.pos_type = OBJ_REQUEST_BVECS,
		.pos = (union rbd_img_fill_iter *)bvec_pos,
		.set_pos_fn = set_bvec_pos,
		.count_fn = count_bvecs,
		.copy_fn = copy_bvecs,
	};

	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
				    &fctx);
}

static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
				   struct ceph_file_extent *img_extents,
				   u32 num_img_extents,
				   struct bio_vec *bvecs)
{
	struct ceph_bvec_iter it = {
		.bvecs = bvecs,
		.iter = { .bi_size = ceph_file_extents_bytes(img_extents,
							     num_img_extents) },
	};

	return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
					 &it);
}

static void rbd_img_request_submit(struct rbd_img_request *img_request)
{
	struct rbd_obj_request *obj_request;

	dout("%s: img %p\n", __func__, img_request);

	rbd_img_request_get(img_request);
	for_each_obj_request(img_request, obj_request)
		rbd_obj_request_submit(obj_request);

	rbd_img_request_put(img_request);
}

static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
{
	struct rbd_img_request *img_req = obj_req->img_request;
	struct rbd_img_request *child_img_req;
	int ret;

	child_img_req = rbd_img_request_create(img_req->rbd_dev->parent,
					       OBJ_OP_READ, NULL);
	if (!child_img_req)
		return -ENOMEM;

	__set_bit(IMG_REQ_CHILD, &child_img_req->flags);
	child_img_req->obj_request = obj_req;

	if (!rbd_img_is_write(img_req)) {
		switch (img_req->data_type) {
		case OBJ_REQUEST_BIO:
			ret = __rbd_img_fill_from_bio(child_img_req,
						      obj_req->img_extents,
						      obj_req->num_img_extents,
						      &obj_req->bio_pos);
			break;
		case OBJ_REQUEST_BVECS:
		case OBJ_REQUEST_OWN_BVECS:
			ret = __rbd_img_fill_from_bvecs(child_img_req,
						      obj_req->img_extents,
						      obj_req->num_img_extents,
						      &obj_req->bvec_pos);
			break;
		default:
			rbd_assert(0);
		}
	} else {
		ret = rbd_img_fill_from_bvecs(child_img_req,
					      obj_req->img_extents,
					      obj_req->num_img_extents,
					      obj_req->copyup_bvecs);
	}
	if (ret) {
		rbd_img_request_put(child_img_req);
		return ret;
	}

	rbd_img_request_submit(child_img_req);
	return 0;
}

static bool rbd_obj_handle_read(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	int ret;

	if (obj_req->result == -ENOENT &&
	    rbd_dev->parent_overlap && !obj_req->tried_parent) {
		/* reverse map this object extent onto the parent */
		ret = rbd_obj_calc_img_extents(obj_req, false);
		if (ret) {
			obj_req->result = ret;
			return true;
		}

		if (obj_req->num_img_extents) {
			obj_req->tried_parent = true;
			ret = rbd_obj_read_from_parent(obj_req);
			if (ret) {
				obj_req->result = ret;
				return true;
			}
			return false;
		}
	}

	/*
	 * -ENOENT means a hole in the image -- zero-fill the entire
	 * length of the request.  A short read also implies zero-fill
	 * to the end of the request.  In both cases we update xferred
	 * count to indicate the whole request was satisfied.
	 */
	if (obj_req->result == -ENOENT ||
	    (!obj_req->result && obj_req->xferred < obj_req->ex.oe_len)) {
		rbd_assert(!obj_req->xferred || !obj_req->result);
		rbd_obj_zero_range(obj_req, obj_req->xferred,
				   obj_req->ex.oe_len - obj_req->xferred);
		obj_req->result = 0;
		obj_req->xferred = obj_req->ex.oe_len;
	}

	return true;
}

/*
 * copyup_bvecs pages are never highmem pages
 */
static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
{
	struct ceph_bvec_iter it = {
		.bvecs = bvecs,
		.iter = { .bi_size = bytes },
	};

	ceph_bvec_iter_advance_step(&it, bytes, ({
		if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
			       bv.bv_len))
			return false;
	}));
	return true;
}

static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes)
{
	unsigned int num_osd_ops = obj_req->osd_req->r_num_ops;

	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
	rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT);
	rbd_osd_req_destroy(obj_req->osd_req);

	/*
	 * Create a copyup request with the same number of OSD ops as
	 * the original request.  The original request was stat + op(s),
	 * the new copyup request will be copyup + the same op(s).
	 */
	obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
	if (!obj_req->osd_req)
		return -ENOMEM;

	/*
	 * Only send non-zero copyup data to save some I/O and network
	 * bandwidth -- zero copyup data is equivalent to the object not
	 * existing.
	 */
	if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) {
		dout("%s obj_req %p detected zeroes\n", __func__, obj_req);
		bytes = 0;
	}

	osd_req_op_cls_init(obj_req->osd_req, 0, CEPH_OSD_OP_CALL, "rbd",
			    "copyup");
	osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0,
					  obj_req->copyup_bvecs, bytes);

	switch (obj_req->img_request->op_type) {
	case OBJ_OP_WRITE:
		__rbd_obj_setup_write(obj_req, 1);
		break;
	case OBJ_OP_DISCARD:
		rbd_assert(!rbd_obj_is_entire(obj_req));
		__rbd_obj_setup_discard(obj_req, 1);
		break;
	default:
		rbd_assert(0);
	}

	rbd_obj_request_submit(obj_req);
	return 0;
}
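
/*
 * Example of the op array rewrite performed by rbd_obj_issue_copyup()
 * for a guarded write:
 *
 *	original request:  [ stat, setallochint, write ]
 *	copyup request:    [ call rbd.copyup, setallochint, write ]
 *
 * The op count is unchanged, which is why the new OSD request can be
 * allocated with the old r_num_ops.
 */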

static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
{
	u32 i;

	rbd_assert(!obj_req->copyup_bvecs);
	obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
	obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
					sizeof(*obj_req->copyup_bvecs),
					GFP_NOIO);
	if (!obj_req->copyup_bvecs)
		return -ENOMEM;

	for (i = 0; i < obj_req->copyup_bvec_count; i++) {
		unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);

		obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
		if (!obj_req->copyup_bvecs[i].bv_page)
			return -ENOMEM;

		obj_req->copyup_bvecs[i].bv_offset = 0;
		obj_req->copyup_bvecs[i].bv_len = len;
		obj_overlap -= len;
	}

	rbd_assert(!obj_overlap);
	return 0;
}
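
/*
 * Worked example (assuming 4K pages): an 8M object overlap yields
 * calc_pages_for(0, 8M) == 2048 bvecs, each covering a full page.  A
 * 10K overlap would yield 3 bvecs of 4K, 4K and 2K -- only the last
 * bvec can be short.
 */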

static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	int ret;

	rbd_assert(obj_req->num_img_extents);
	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
		      rbd_dev->parent_overlap);
	if (!obj_req->num_img_extents) {
		/*
		 * The overlap has become 0 (most likely because the
		 * image has been flattened).  Use rbd_obj_issue_copyup()
		 * to re-submit the original write request -- the copyup
		 * operation itself will be a no-op, since someone must
		 * have populated the child object while we weren't
		 * looking.  Move to WRITE_FLAT state as we'll be done
		 * with the operation once the null copyup completes.
		 */
		obj_req->write_state = RBD_OBJ_WRITE_FLAT;
		return rbd_obj_issue_copyup(obj_req, 0);
	}

	ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
	if (ret)
		return ret;

	obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
	return rbd_obj_read_from_parent(obj_req);
}

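/*
 * Rough sketch of the write state machine implemented below:
 *
 *	WRITE_FLAT: object written directly, done on completion
 *	WRITE_GUARD: stat succeeded (object exists) -> done;
 *	             stat returned -ENOENT -> read parent data,
 *	             move to WRITE_COPYUP
 *	WRITE_COPYUP: parent read finished -> issue copyup + the
 *	              original op(s), move back to WRITE_GUARD (or
 *	              WRITE_FLAT for a null copyup), then done
 */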
static bool rbd_obj_handle_write(struct rbd_obj_request *obj_req)
{
	int ret;

again:
	switch (obj_req->write_state) {
	case RBD_OBJ_WRITE_GUARD:
		rbd_assert(!obj_req->xferred);
		if (obj_req->result == -ENOENT) {
			/*
			 * The target object doesn't exist.  Read the data for
			 * the entire target object up to the overlap point (if
			 * any) from the parent, so we can use it for a copyup.
			 */
			ret = rbd_obj_handle_write_guard(obj_req);
			if (ret) {
				obj_req->result = ret;
				return true;
			}
			return false;
		}
		/* fall through */
	case RBD_OBJ_WRITE_FLAT:
		if (!obj_req->result)
			/*
			 * There is no such thing as a successful short
			 * write -- indicate the whole request was satisfied.
			 */
			obj_req->xferred = obj_req->ex.oe_len;
		return true;
	case RBD_OBJ_WRITE_COPYUP:
		obj_req->write_state = RBD_OBJ_WRITE_GUARD;
		if (obj_req->result)
			goto again;

		rbd_assert(obj_req->xferred);
		ret = rbd_obj_issue_copyup(obj_req, obj_req->xferred);
		if (ret) {
			obj_req->result = ret;
			return true;
		}
		return false;
	default:
		rbd_assert(0);
	}
}

/*
 * Returns true if @obj_req is completed, or false otherwise.
 */
static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req)
{
	switch (obj_req->img_request->op_type) {
	case OBJ_OP_READ:
		return rbd_obj_handle_read(obj_req);
	case OBJ_OP_WRITE:
		return rbd_obj_handle_write(obj_req);
	case OBJ_OP_DISCARD:
		if (rbd_obj_handle_write(obj_req)) {
			/*
			 * Hide -ENOENT from delete/truncate/zero -- discarding
			 * a non-existent object is not a problem.
			 */
			if (obj_req->result == -ENOENT) {
				obj_req->result = 0;
				obj_req->xferred = obj_req->ex.oe_len;
			}
			return true;
		}
		return false;
	default:
		rbd_assert(0);
	}
}

static void rbd_obj_end_request(struct rbd_obj_request *obj_req)
{
	struct rbd_img_request *img_req = obj_req->img_request;

	rbd_assert((!obj_req->result &&
		    obj_req->xferred == obj_req->ex.oe_len) ||
		   (obj_req->result < 0 && !obj_req->xferred));
	if (!obj_req->result) {
		img_req->xferred += obj_req->xferred;
		return;
	}

	rbd_warn(img_req->rbd_dev,
		 "%s at objno %llu %llu~%llu result %d xferred %llu",
		 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
		 obj_req->ex.oe_off, obj_req->ex.oe_len, obj_req->result,
		 obj_req->xferred);
	if (!img_req->result) {
		img_req->result = obj_req->result;
		img_req->xferred = 0;
	}
}

static void rbd_img_end_child_request(struct rbd_img_request *img_req)
{
	struct rbd_obj_request *obj_req = img_req->obj_request;

	rbd_assert(test_bit(IMG_REQ_CHILD, &img_req->flags));
	rbd_assert((!img_req->result &&
		    img_req->xferred == rbd_obj_img_extents_bytes(obj_req)) ||
		   (img_req->result < 0 && !img_req->xferred));

	obj_req->result = img_req->result;
	obj_req->xferred = img_req->xferred;
	rbd_img_request_put(img_req);
}

static void rbd_img_end_request(struct rbd_img_request *img_req)
{
	rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
	rbd_assert((!img_req->result &&
		    img_req->xferred == blk_rq_bytes(img_req->rq)) ||
		   (img_req->result < 0 && !img_req->xferred));

	blk_mq_end_request(img_req->rq,
			   errno_to_blk_status(img_req->result));
	rbd_img_request_put(img_req);
}

static void rbd_obj_handle_request(struct rbd_obj_request *obj_req)
{
	struct rbd_img_request *img_req;

again:
	if (!__rbd_obj_handle_request(obj_req))
		return;

	img_req = obj_req->img_request;
	spin_lock(&img_req->completion_lock);
	rbd_obj_end_request(obj_req);
	rbd_assert(img_req->pending_count);
	if (--img_req->pending_count) {
		spin_unlock(&img_req->completion_lock);
		return;
	}

	spin_unlock(&img_req->completion_lock);
	if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
		obj_req = img_req->obj_request;
		rbd_img_end_child_request(img_req);
		goto again;
	}
	rbd_img_end_request(img_req);
}

static const struct rbd_client_id rbd_empty_cid;

static bool rbd_cid_equal(const struct rbd_client_id *lhs,
			  const struct rbd_client_id *rhs)
{
	return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
}

static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
{
	struct rbd_client_id cid;

	mutex_lock(&rbd_dev->watch_mutex);
	cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
	cid.handle = rbd_dev->watch_cookie;
	mutex_unlock(&rbd_dev->watch_mutex);
	return cid;
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
			      const struct rbd_client_id *cid)
{
	dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
	     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
	     cid->gid, cid->handle);
	rbd_dev->owner_cid = *cid; /* struct */
}

static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
{
	mutex_lock(&rbd_dev->watch_mutex);
	sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
	mutex_unlock(&rbd_dev->watch_mutex);
}
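
/*
 * Example: assuming RBD_LOCK_COOKIE_PREFIX is "auto" (defined earlier
 * in this file) and a watch cookie of 18446, the generated cookie is
 * "auto 18446" -- even a full 20-digit u64 fits the 32-byte buffer
 * used by rbd_lock() below.
 */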

static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
{
	struct rbd_client_id cid = rbd_get_cid(rbd_dev);

	strcpy(rbd_dev->lock_cookie, cookie);
	rbd_set_owner_cid(rbd_dev, &cid);
	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
}

/*
 * lock_rwsem must be held for write
 */
static int rbd_lock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char cookie[32];
	int ret;

	WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
		rbd_dev->lock_cookie[0] != '\0');

	format_lock_cookie(rbd_dev, cookie);
	ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			    RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
			    RBD_LOCK_TAG, "", 0);
	if (ret)
		return ret;

	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
	__rbd_lock(rbd_dev, cookie);
	return 0;
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_unlock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
		rbd_dev->lock_cookie[0] == '\0');

	ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			      RBD_LOCK_NAME, rbd_dev->lock_cookie);
	if (ret && ret != -ENOENT)
		rbd_warn(rbd_dev, "failed to unlock: %d", ret);

	/* treat errors as the image is unlocked */
	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
	rbd_dev->lock_cookie[0] = '\0';
	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
	queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
}

static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
				enum rbd_notify_op notify_op,
				struct page ***preply_pages,
				size_t *preply_len)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
	int buf_size = 4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN;
	char buf[buf_size];
	void *p = buf;

	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);

	/* encode *LockPayload NotifyMessage (op + ClientId) */
	ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
	ceph_encode_32(&p, notify_op);
	ceph_encode_64(&p, cid.gid);
	ceph_encode_64(&p, cid.handle);

	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
				&rbd_dev->header_oloc, buf, buf_size,
				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
}
2673
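/*
 * Wire layout of the NotifyMessage payload built above (a sketch,
 * assuming CEPH_ENCODING_START_BLK_LEN is the 6-byte encoding header
 * and all integers are little-endian):
 *
 *	offset  0: u8   struct_v (2)
 *	offset  1: u8   struct_compat (1)
 *	offset  2: le32 struct_len (4 + 8 + 8 = 20)
 *	offset  6: le32 notify_op
 *	offset 10: le64 cid.gid
 *	offset 18: le64 cid.handle
 *
 * 26 bytes in total, matching buf_size.
 */
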
2674static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
2675 enum rbd_notify_op notify_op)
2676{
2677 struct page **reply_pages;
2678 size_t reply_len;
2679
2680 __rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
2681 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
2682}
2683
2684static void rbd_notify_acquired_lock(struct work_struct *work)
2685{
2686 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
2687 acquired_lock_work);
2688
2689 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
2690}
2691
2692static void rbd_notify_released_lock(struct work_struct *work)
2693{
2694 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
2695 released_lock_work);
2696
2697 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
2698}
2699
2700static int rbd_request_lock(struct rbd_device *rbd_dev)
2701{
2702 struct page **reply_pages;
2703 size_t reply_len;
2704 bool lock_owner_responded = false;
2705 int ret;
2706
2707 dout("%s rbd_dev %p\n", __func__, rbd_dev);
2708
2709 ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
2710 &reply_pages, &reply_len);
2711 if (ret && ret != -ETIMEDOUT) {
2712 rbd_warn(rbd_dev, "failed to request lock: %d", ret);
2713 goto out;
2714 }
2715
2716 if (reply_len > 0 && reply_len <= PAGE_SIZE) {
2717 void *p = page_address(reply_pages[0]);
2718 void *const end = p + reply_len;
2719 u32 n;
2720
2721 ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
2722 while (n--) {
2723 u8 struct_v;
2724 u32 len;
2725
2726 ceph_decode_need(&p, end, 8 + 8, e_inval);
2727 p += 8 + 8; /* skip gid and cookie */
2728
2729 ceph_decode_32_safe(&p, end, len, e_inval);
2730 if (!len)
2731 continue;
2732
2733 if (lock_owner_responded) {
2734 rbd_warn(rbd_dev,
2735 "duplicate lock owners detected");
2736 ret = -EIO;
2737 goto out;
2738 }
2739
2740 lock_owner_responded = true;
2741 ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
2742 &struct_v, &len);
2743 if (ret) {
2744 rbd_warn(rbd_dev,
2745 "failed to decode ResponseMessage: %d",
2746 ret);
2747 goto e_inval;
2748 }
2749
2750 ret = ceph_decode_32(&p);
2751 }
2752 }
2753
2754 if (!lock_owner_responded) {
2755 rbd_warn(rbd_dev, "no lock owners detected");
2756 ret = -ETIMEDOUT;
2757 }
2758
2759out:
2760 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
2761 return ret;
2762
2763e_inval:
2764 ret = -EINVAL;
2765 goto out;
2766}
2767
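/*
 * Shape of the notify reply decoded above, as implied by the decoding
 * logic (a sketch, not a wire-format spec): a u32 count of acks, then
 * for each ack the notifier's gid and cookie (8 + 8 bytes, skipped
 * here), a u32 payload length and, only from the lock owner, an
 * encoded ResponseMessage whose s32 result becomes this function's
 * return value.  A non-empty payload from more than one notifier
 * means duplicate lock owners.
 */
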
2768static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
2769{
2770 dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);
2771
2772 cancel_delayed_work(&rbd_dev->lock_dwork);
2773 if (wake_all)
2774 wake_up_all(&rbd_dev->lock_waitq);
2775 else
2776 wake_up(&rbd_dev->lock_waitq);
2777}
2778
2779static int get_lock_owner_info(struct rbd_device *rbd_dev,
2780 struct ceph_locker **lockers, u32 *num_lockers)
2781{
2782 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2783 u8 lock_type;
2784 char *lock_tag;
2785 int ret;
2786
2787 dout("%s rbd_dev %p\n", __func__, rbd_dev);
2788
2789 ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
2790 &rbd_dev->header_oloc, RBD_LOCK_NAME,
2791 &lock_type, &lock_tag, lockers, num_lockers);
2792 if (ret)
2793 return ret;
2794
2795 if (*num_lockers == 0) {
2796 dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
2797 goto out;
2798 }
2799
2800 if (strcmp(lock_tag, RBD_LOCK_TAG)) {
2801 rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
2802 lock_tag);
2803 ret = -EBUSY;
2804 goto out;
2805 }
2806
2807 if (lock_type == CEPH_CLS_LOCK_SHARED) {
2808 rbd_warn(rbd_dev, "shared lock type detected");
2809 ret = -EBUSY;
2810 goto out;
2811 }
2812
2813 if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
2814 strlen(RBD_LOCK_COOKIE_PREFIX))) {
2815 rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
2816 (*lockers)[0].id.cookie);
2817 ret = -EBUSY;
2818 goto out;
2819 }
2820
2821out:
2822 kfree(lock_tag);
2823 return ret;
2824}
2825
2826static int find_watcher(struct rbd_device *rbd_dev,
2827 const struct ceph_locker *locker)
2828{
2829 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2830 struct ceph_watch_item *watchers;
2831 u32 num_watchers;
2832 u64 cookie;
2833 int i;
2834 int ret;
2835
2836 ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
2837 &rbd_dev->header_oloc, &watchers,
2838 &num_watchers);
2839 if (ret)
2840 return ret;
2841
2842 sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
2843 for (i = 0; i < num_watchers; i++) {
2844 if (!memcmp(&watchers[i].addr, &locker->info.addr,
2845 sizeof(locker->info.addr)) &&
2846 watchers[i].cookie == cookie) {
2847 struct rbd_client_id cid = {
2848 .gid = le64_to_cpu(watchers[i].name.num),
2849 .handle = cookie,
2850 };
2851
2852 dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
2853 rbd_dev, cid.gid, cid.handle);
2854 rbd_set_owner_cid(rbd_dev, &cid);
2855 ret = 1;
2856 goto out;
2857 }
2858 }
2859
2860 dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
2861 ret = 0;
2862out:
2863 kfree(watchers);
2864 return ret;
2865}
2866
2867/*
2868 * lock_rwsem must be held for write
2869 */
2870static int rbd_try_lock(struct rbd_device *rbd_dev)
2871{
2872 struct ceph_client *client = rbd_dev->rbd_client->client;
2873 struct ceph_locker *lockers;
2874 u32 num_lockers;
2875 int ret;
2876
2877 for (;;) {
2878 ret = rbd_lock(rbd_dev);
2879 if (ret != -EBUSY)
2880 return ret;
2881
2882 /* determine if the current lock holder is still alive */
2883 ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
2884 if (ret)
2885 return ret;
2886
2887 if (num_lockers == 0)
2888 goto again;
2889
2890 ret = find_watcher(rbd_dev, lockers);
2891 if (ret) {
2892 if (ret > 0)
2893 ret = 0; /* have to request lock */
2894 goto out;
2895 }
2896
2897 rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
2898 ENTITY_NAME(lockers[0].id.name));
2899
2900 ret = ceph_monc_blacklist_add(&client->monc,
2901 &lockers[0].info.addr);
2902 if (ret) {
2903 rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
2904 ENTITY_NAME(lockers[0].id.name), ret);
2905 goto out;
2906 }
2907
2908 ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
2909 &rbd_dev->header_oloc, RBD_LOCK_NAME,
2910 lockers[0].id.cookie,
2911 &lockers[0].id.name);
2912 if (ret && ret != -ENOENT)
2913 goto out;
2914
2915again:
2916 ceph_free_lockers(lockers, num_lockers);
2917 }
2918
2919out:
2920 ceph_free_lockers(lockers, num_lockers);
2921 return ret;
2922}
2923
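/*
 * In short, the loop above is:
 *
 *  1. try to take the exclusive lock; any error other than -EBUSY
 *     is final
 *  2. on -EBUSY, look up the current holder and give up (-EBUSY)
 *     if the lock is held by an external mechanism or is shared
 *  3. if the holder still has a watch established it is alive, so
 *     return 0 and fall back to requesting the lock from it
 *  4. otherwise blacklist the holder, break its stale lock and
 *     retry from step 1
 */
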
2924/*
2925 * *pret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED
2926 */
2927static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev,
2928 int *pret)
2929{
2930 enum rbd_lock_state lock_state;
2931
2932 down_read(&rbd_dev->lock_rwsem);
2933 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
2934 rbd_dev->lock_state);
2935 if (__rbd_is_lock_owner(rbd_dev)) {
2936 lock_state = rbd_dev->lock_state;
2937 up_read(&rbd_dev->lock_rwsem);
2938 return lock_state;
2939 }
2940
2941 up_read(&rbd_dev->lock_rwsem);
2942 down_write(&rbd_dev->lock_rwsem);
2943 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
2944 rbd_dev->lock_state);
2945 if (!__rbd_is_lock_owner(rbd_dev)) {
2946 *pret = rbd_try_lock(rbd_dev);
2947 if (*pret)
2948 rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret);
2949 }
2950
2951 lock_state = rbd_dev->lock_state;
2952 up_write(&rbd_dev->lock_rwsem);
2953 return lock_state;
2954}
2955
2956static void rbd_acquire_lock(struct work_struct *work)
2957{
2958 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
2959 struct rbd_device, lock_dwork);
2960 enum rbd_lock_state lock_state;
Kefeng Wang37f13252017-07-13 15:46:35 +08002961 int ret = 0;
Ilya Dryomoved95b212016-08-12 16:40:02 +02002962
2963 dout("%s rbd_dev %p\n", __func__, rbd_dev);
2964again:
2965 lock_state = rbd_try_acquire_lock(rbd_dev, &ret);
2966 if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) {
2967 if (lock_state == RBD_LOCK_STATE_LOCKED)
2968 wake_requests(rbd_dev, true);
2969 dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
2970 rbd_dev, lock_state, ret);
2971 return;
2972 }
2973
2974 ret = rbd_request_lock(rbd_dev);
2975 if (ret == -ETIMEDOUT) {
2976 goto again; /* treat this as a dead client */
Ilya Dryomove010dd02017-04-13 12:17:39 +02002977 } else if (ret == -EROFS) {
2978 rbd_warn(rbd_dev, "peer will not release lock");
2979 /*
2980 * If this is rbd_add_acquire_lock(), we want to fail
2981 * immediately -- reuse BLACKLISTED flag. Otherwise we
2982 * want to block.
2983 */
2984 if (!(rbd_dev->disk->flags & GENHD_FL_UP)) {
2985 set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
2986 /* wake "rbd map --exclusive" process */
2987 wake_requests(rbd_dev, false);
2988 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02002989 } else if (ret < 0) {
2990 rbd_warn(rbd_dev, "error requesting lock: %d", ret);
2991 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
2992 RBD_RETRY_DELAY);
2993 } else {
2994 /*
2995 * lock owner acked, but resend if we don't see them
2996 * release the lock
2997 */
2998 dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
2999 rbd_dev);
3000 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3001 msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
3002 }
3003}
3004
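/*
 * Outcomes of rbd_request_lock() handled above:
 *
 *	-ETIMEDOUT: no owner responded -- treat it as dead and retry
 *		    the acquire immediately
 *	-EROFS:	    the owner refuses to release; fail a mapping in
 *		    progress, block an already mapped device
 *	other < 0:  transient error, retry after RBD_RETRY_DELAY
 *	0:	    the owner acked and should release soon; poll
 *		    again after twice the notify timeout in case the
 *		    ReleasedLock notification never arrives
 */
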
3005/*
3006 * lock_rwsem must be held for write
3007 */
3008static bool rbd_release_lock(struct rbd_device *rbd_dev)
3009{
3010 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3011 rbd_dev->lock_state);
3012 if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
3013 return false;
3014
3015 rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
3016 downgrade_write(&rbd_dev->lock_rwsem);
3017 /*
3018 * Ensure that all in-flight IO is flushed.
3019 *
3020 * FIXME: ceph_osdc_sync() flushes the entire OSD client, which
3021 * may be shared with other devices.
3022 */
3023 ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc);
3024 up_read(&rbd_dev->lock_rwsem);
3025
3026 down_write(&rbd_dev->lock_rwsem);
3027 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3028 rbd_dev->lock_state);
3029 if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
3030 return false;
3031
Ilya Dryomovbbead742017-04-13 12:17:38 +02003032 rbd_unlock(rbd_dev);
3033 /*
3034	 * Give others a chance to grab the lock - otherwise we would
3035	 * re-acquire it almost immediately if new IO arrived during
3036	 * ceph_osdc_sync(). We need to ack our own notifications, so this
3037 * lock_dwork will be requeued from rbd_wait_state_locked()
3038 * after wake_requests() in rbd_handle_released_lock().
3039 */
3040 cancel_delayed_work(&rbd_dev->lock_dwork);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003041 return true;
3042}
3043
3044static void rbd_release_lock_work(struct work_struct *work)
3045{
3046 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3047 unlock_work);
3048
3049 down_write(&rbd_dev->lock_rwsem);
3050 rbd_release_lock(rbd_dev);
3051 up_write(&rbd_dev->lock_rwsem);
3052}
3053
3054static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
3055 void **p)
3056{
3057 struct rbd_client_id cid = { 0 };
3058
3059 if (struct_v >= 2) {
3060 cid.gid = ceph_decode_64(p);
3061 cid.handle = ceph_decode_64(p);
3062 }
3063
3064 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3065 cid.handle);
3066 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3067 down_write(&rbd_dev->lock_rwsem);
3068 if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3069 /*
3070 * we already know that the remote client is
3071 * the owner
3072 */
3073 up_write(&rbd_dev->lock_rwsem);
3074 return;
3075 }
3076
3077 rbd_set_owner_cid(rbd_dev, &cid);
3078 downgrade_write(&rbd_dev->lock_rwsem);
3079 } else {
3080 down_read(&rbd_dev->lock_rwsem);
3081 }
3082
3083 if (!__rbd_is_lock_owner(rbd_dev))
3084 wake_requests(rbd_dev, false);
3085 up_read(&rbd_dev->lock_rwsem);
3086}
3087
3088static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
3089 void **p)
3090{
3091 struct rbd_client_id cid = { 0 };
3092
3093 if (struct_v >= 2) {
3094 cid.gid = ceph_decode_64(p);
3095 cid.handle = ceph_decode_64(p);
3096 }
3097
3098 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3099 cid.handle);
3100 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3101 down_write(&rbd_dev->lock_rwsem);
3102 if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3103 dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
3104 __func__, rbd_dev, cid.gid, cid.handle,
3105 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
3106 up_write(&rbd_dev->lock_rwsem);
3107 return;
3108 }
3109
3110 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3111 downgrade_write(&rbd_dev->lock_rwsem);
3112 } else {
3113 down_read(&rbd_dev->lock_rwsem);
3114 }
3115
3116 if (!__rbd_is_lock_owner(rbd_dev))
3117 wake_requests(rbd_dev, false);
3118 up_read(&rbd_dev->lock_rwsem);
3119}
3120
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003121/*
3122 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
3123 * ResponseMessage is needed.
3124 */
3125static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
3126 void **p)
Ilya Dryomoved95b212016-08-12 16:40:02 +02003127{
3128 struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
3129 struct rbd_client_id cid = { 0 };
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003130 int result = 1;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003131
3132 if (struct_v >= 2) {
3133 cid.gid = ceph_decode_64(p);
3134 cid.handle = ceph_decode_64(p);
3135 }
3136
3137 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3138 cid.handle);
3139 if (rbd_cid_equal(&cid, &my_cid))
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003140 return result;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003141
3142 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003143 if (__rbd_is_lock_owner(rbd_dev)) {
3144 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
3145 rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
3146 goto out_unlock;
3147
3148 /*
3149 * encode ResponseMessage(0) so the peer can detect
3150 * a missing owner
3151 */
3152 result = 0;
3153
3154 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
Ilya Dryomove010dd02017-04-13 12:17:39 +02003155 if (!rbd_dev->opts->exclusive) {
3156 dout("%s rbd_dev %p queueing unlock_work\n",
3157 __func__, rbd_dev);
3158 queue_work(rbd_dev->task_wq,
3159 &rbd_dev->unlock_work);
3160 } else {
3161 /* refuse to release the lock */
3162 result = -EROFS;
3163 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02003164 }
3165 }
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003166
3167out_unlock:
Ilya Dryomoved95b212016-08-12 16:40:02 +02003168 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003169 return result;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003170}
3171
3172static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
3173 u64 notify_id, u64 cookie, s32 *result)
3174{
3175 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3176 int buf_size = 4 + CEPH_ENCODING_START_BLK_LEN;
3177 char buf[buf_size];
3178 int ret;
3179
3180 if (result) {
3181 void *p = buf;
3182
3183 /* encode ResponseMessage */
3184 ceph_start_encoding(&p, 1, 1,
3185 buf_size - CEPH_ENCODING_START_BLK_LEN);
3186 ceph_encode_32(&p, *result);
3187 } else {
3188 buf_size = 0;
3189 }
3190
3191 ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
3192 &rbd_dev->header_oloc, notify_id, cookie,
3193 buf, buf_size);
3194 if (ret)
3195 rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
3196}
3197
3198static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
3199 u64 cookie)
3200{
3201 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3202 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
3203}
3204
3205static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
3206 u64 notify_id, u64 cookie, s32 result)
3207{
3208 dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3209 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
3210}
Ilya Dryomov922dab62016-05-26 01:15:02 +02003211
3212static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
3213 u64 notifier_id, void *data, size_t data_len)
Alex Elderb8d70032012-11-30 17:53:04 -06003214{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003215 struct rbd_device *rbd_dev = arg;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003216 void *p = data;
3217 void *const end = p + data_len;
Ilya Dryomovd4c22692016-09-06 11:15:48 +02003218 u8 struct_v = 0;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003219 u32 len;
3220 u32 notify_op;
Alex Elderb8d70032012-11-30 17:53:04 -06003221 int ret;
3222
Ilya Dryomoved95b212016-08-12 16:40:02 +02003223 dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
3224 __func__, rbd_dev, cookie, notify_id, data_len);
3225 if (data_len) {
3226 ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
3227 &struct_v, &len);
3228 if (ret) {
3229 rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
3230 ret);
3231 return;
3232 }
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04003233
Ilya Dryomoved95b212016-08-12 16:40:02 +02003234 notify_op = ceph_decode_32(&p);
3235 } else {
3236 /* legacy notification for header updates */
3237 notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
3238 len = 0;
3239 }
Alex Elderb8d70032012-11-30 17:53:04 -06003240
Ilya Dryomoved95b212016-08-12 16:40:02 +02003241 dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
3242 switch (notify_op) {
3243 case RBD_NOTIFY_OP_ACQUIRED_LOCK:
3244 rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
3245 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3246 break;
3247 case RBD_NOTIFY_OP_RELEASED_LOCK:
3248 rbd_handle_released_lock(rbd_dev, struct_v, &p);
3249 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3250 break;
3251 case RBD_NOTIFY_OP_REQUEST_LOCK:
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003252 ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
3253 if (ret <= 0)
Ilya Dryomoved95b212016-08-12 16:40:02 +02003254 rbd_acknowledge_notify_result(rbd_dev, notify_id,
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003255 cookie, ret);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003256 else
3257 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3258 break;
3259 case RBD_NOTIFY_OP_HEADER_UPDATE:
3260 ret = rbd_dev_refresh(rbd_dev);
3261 if (ret)
3262 rbd_warn(rbd_dev, "refresh failed: %d", ret);
3263
3264 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3265 break;
3266 default:
3267 if (rbd_is_lock_owner(rbd_dev))
3268 rbd_acknowledge_notify_result(rbd_dev, notify_id,
3269 cookie, -EOPNOTSUPP);
3270 else
3271 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3272 break;
3273 }
Alex Elderb8d70032012-11-30 17:53:04 -06003274}
3275
Ilya Dryomov99d16942016-08-12 16:11:41 +02003276static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
3277
Ilya Dryomov922dab62016-05-26 01:15:02 +02003278static void rbd_watch_errcb(void *arg, u64 cookie, int err)
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003279{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003280 struct rbd_device *rbd_dev = arg;
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003281
Ilya Dryomov922dab62016-05-26 01:15:02 +02003282 rbd_warn(rbd_dev, "encountered watch error: %d", err);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003283
Ilya Dryomoved95b212016-08-12 16:40:02 +02003284 down_write(&rbd_dev->lock_rwsem);
3285 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3286 up_write(&rbd_dev->lock_rwsem);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003287
Ilya Dryomov99d16942016-08-12 16:11:41 +02003288 mutex_lock(&rbd_dev->watch_mutex);
3289 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
3290 __rbd_unregister_watch(rbd_dev);
3291 rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003292
Ilya Dryomov99d16942016-08-12 16:11:41 +02003293 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003294 }
Ilya Dryomov99d16942016-08-12 16:11:41 +02003295 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003296}
3297
3298/*
Ilya Dryomov99d16942016-08-12 16:11:41 +02003299 * watch_mutex must be locked
Alex Elder9969ebc2013-01-18 12:31:10 -06003300 */
Ilya Dryomov99d16942016-08-12 16:11:41 +02003301static int __rbd_register_watch(struct rbd_device *rbd_dev)
Alex Elder9969ebc2013-01-18 12:31:10 -06003302{
3303 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomov922dab62016-05-26 01:15:02 +02003304 struct ceph_osd_linger_request *handle;
Alex Elder9969ebc2013-01-18 12:31:10 -06003305
Ilya Dryomov922dab62016-05-26 01:15:02 +02003306 rbd_assert(!rbd_dev->watch_handle);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003307 dout("%s rbd_dev %p\n", __func__, rbd_dev);
Alex Elder9969ebc2013-01-18 12:31:10 -06003308
Ilya Dryomov922dab62016-05-26 01:15:02 +02003309 handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
3310 &rbd_dev->header_oloc, rbd_watch_cb,
3311 rbd_watch_errcb, rbd_dev);
3312 if (IS_ERR(handle))
3313 return PTR_ERR(handle);
Alex Elder9969ebc2013-01-18 12:31:10 -06003314
Ilya Dryomov922dab62016-05-26 01:15:02 +02003315 rbd_dev->watch_handle = handle;
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003316 return 0;
Alex Elder9969ebc2013-01-18 12:31:10 -06003317}
3318
Ilya Dryomov99d16942016-08-12 16:11:41 +02003319/*
3320 * watch_mutex must be locked
3321 */
3322static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
Ilya Dryomovfca27062013-12-16 18:02:40 +02003323{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003324 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3325 int ret;
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003326
Ilya Dryomov99d16942016-08-12 16:11:41 +02003327 rbd_assert(rbd_dev->watch_handle);
3328 dout("%s rbd_dev %p\n", __func__, rbd_dev);
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003329
Ilya Dryomov922dab62016-05-26 01:15:02 +02003330 ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
3331 if (ret)
3332 rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003333
Ilya Dryomov922dab62016-05-26 01:15:02 +02003334 rbd_dev->watch_handle = NULL;
Ilya Dryomovc525f032016-04-28 16:07:26 +02003335}
3336
Ilya Dryomov99d16942016-08-12 16:11:41 +02003337static int rbd_register_watch(struct rbd_device *rbd_dev)
Ilya Dryomovc525f032016-04-28 16:07:26 +02003338{
Ilya Dryomov99d16942016-08-12 16:11:41 +02003339 int ret;
Ilya Dryomov811c6682016-04-15 16:22:16 +02003340
Ilya Dryomov99d16942016-08-12 16:11:41 +02003341 mutex_lock(&rbd_dev->watch_mutex);
3342 rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
3343 ret = __rbd_register_watch(rbd_dev);
3344 if (ret)
3345 goto out;
3346
3347 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
3348 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
3349
3350out:
3351 mutex_unlock(&rbd_dev->watch_mutex);
3352 return ret;
3353}
3354
3355static void cancel_tasks_sync(struct rbd_device *rbd_dev)
3356{
3357 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3358
3359 cancel_delayed_work_sync(&rbd_dev->watch_dwork);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003360 cancel_work_sync(&rbd_dev->acquired_lock_work);
3361 cancel_work_sync(&rbd_dev->released_lock_work);
3362 cancel_delayed_work_sync(&rbd_dev->lock_dwork);
3363 cancel_work_sync(&rbd_dev->unlock_work);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003364}
3365
3366static void rbd_unregister_watch(struct rbd_device *rbd_dev)
3367{
Ilya Dryomoved95b212016-08-12 16:40:02 +02003368 WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
Ilya Dryomov99d16942016-08-12 16:11:41 +02003369 cancel_tasks_sync(rbd_dev);
3370
3371 mutex_lock(&rbd_dev->watch_mutex);
3372 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
3373 __rbd_unregister_watch(rbd_dev);
3374 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
3375 mutex_unlock(&rbd_dev->watch_mutex);
3376
Ilya Dryomov811c6682016-04-15 16:22:16 +02003377 ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
Ilya Dryomovfca27062013-12-16 18:02:40 +02003378}
3379
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003380/*
3381 * lock_rwsem must be held for write
3382 */
3383static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
3384{
3385 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3386 char cookie[32];
3387 int ret;
3388
3389 WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);
3390
3391 format_lock_cookie(rbd_dev, cookie);
3392 ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
3393 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3394 CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
3395 RBD_LOCK_TAG, cookie);
3396 if (ret) {
3397 if (ret != -EOPNOTSUPP)
3398 rbd_warn(rbd_dev, "failed to update lock cookie: %d",
3399 ret);
3400
3401 /*
3402 * Lock cookie cannot be updated on older OSDs, so do
3403 * a manual release and queue an acquire.
3404 */
3405 if (rbd_release_lock(rbd_dev))
3406 queue_delayed_work(rbd_dev->task_wq,
3407 &rbd_dev->lock_dwork, 0);
3408 } else {
Florian Margaineedd8ca82017-12-13 16:43:59 +01003409 __rbd_lock(rbd_dev, cookie);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003410 }
3411}
3412
Ilya Dryomov99d16942016-08-12 16:11:41 +02003413static void rbd_reregister_watch(struct work_struct *work)
3414{
3415 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3416 struct rbd_device, watch_dwork);
3417 int ret;
3418
3419 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3420
3421 mutex_lock(&rbd_dev->watch_mutex);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003422 if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
3423 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003424 return;
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003425 }
Ilya Dryomov99d16942016-08-12 16:11:41 +02003426
3427 ret = __rbd_register_watch(rbd_dev);
3428 if (ret) {
3429 rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
Ilya Dryomov4d736442016-09-29 14:23:12 +02003430 if (ret == -EBLACKLISTED || ret == -ENOENT) {
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003431 set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003432 wake_requests(rbd_dev, true);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003433 } else {
Ilya Dryomov99d16942016-08-12 16:11:41 +02003434 queue_delayed_work(rbd_dev->task_wq,
3435 &rbd_dev->watch_dwork,
3436 RBD_RETRY_DELAY);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003437 }
3438 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003439 return;
Ilya Dryomov99d16942016-08-12 16:11:41 +02003440 }
3441
3442 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
3443 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
3444 mutex_unlock(&rbd_dev->watch_mutex);
3445
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003446 down_write(&rbd_dev->lock_rwsem);
3447 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
3448 rbd_reacquire_lock(rbd_dev);
3449 up_write(&rbd_dev->lock_rwsem);
3450
Ilya Dryomov99d16942016-08-12 16:11:41 +02003451 ret = rbd_dev_refresh(rbd_dev);
3452 if (ret)
3453		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003454}
3455
Alex Elder36be9a72013-01-19 00:30:28 -06003456/*
Alex Elderf40eb342013-04-25 15:09:42 -05003457 * Synchronous osd object method call. Returns the number of bytes
3458 * returned in the outbound buffer, or a negative error code.
Alex Elder36be9a72013-01-19 00:30:28 -06003459 */
3460static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003461 struct ceph_object_id *oid,
3462 struct ceph_object_locator *oloc,
Alex Elder36be9a72013-01-19 00:30:28 -06003463 const char *method_name,
Alex Elder41579762013-04-21 12:14:45 -05003464 const void *outbound,
Alex Elder36be9a72013-01-19 00:30:28 -06003465 size_t outbound_size,
Alex Elder41579762013-04-21 12:14:45 -05003466 void *inbound,
Alex Eldere2a58ee2013-04-30 00:44:33 -05003467 size_t inbound_size)
Alex Elder36be9a72013-01-19 00:30:28 -06003468{
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003469 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3470 struct page *req_page = NULL;
3471 struct page *reply_page;
Alex Elder36be9a72013-01-19 00:30:28 -06003472 int ret;
3473
3474 /*
Alex Elder6010a452013-04-05 01:27:11 -05003475	 * Method calls are ultimately read operations.  The result
3476	 * should be placed into the inbound buffer provided.  They
3477	 * also supply outbound data--parameters for the object
3478	 * method.  Currently, if outbound data is present, it will
3479	 * be a snapshot id.
Alex Elder36be9a72013-01-19 00:30:28 -06003480 */
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003481 if (outbound) {
3482 if (outbound_size > PAGE_SIZE)
3483 return -E2BIG;
Alex Elder36be9a72013-01-19 00:30:28 -06003484
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003485 req_page = alloc_page(GFP_KERNEL);
3486 if (!req_page)
3487 return -ENOMEM;
Alex Elder36be9a72013-01-19 00:30:28 -06003488
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003489 memcpy(page_address(req_page), outbound, outbound_size);
Alex Elder04017e22013-04-05 14:46:02 -05003490 }
Alex Elder430c28c2013-04-03 21:32:51 -05003491
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003492 reply_page = alloc_page(GFP_KERNEL);
3493 if (!reply_page) {
3494 if (req_page)
3495 __free_page(req_page);
3496 return -ENOMEM;
3497 }
Alex Elder36be9a72013-01-19 00:30:28 -06003498
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003499 ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
3500 CEPH_OSD_FLAG_READ, req_page, outbound_size,
3501 reply_page, &inbound_size);
3502 if (!ret) {
3503 memcpy(inbound, page_address(reply_page), inbound_size);
3504 ret = inbound_size;
3505 }
Alex Elder57385b52013-04-21 12:14:45 -05003506
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003507 if (req_page)
3508 __free_page(req_page);
3509 __free_page(reply_page);
Alex Elder36be9a72013-01-19 00:30:28 -06003510 return ret;
3511}
3512
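/*
 * A minimal usage sketch, modeled on the v2 "get_size" class method
 * call made elsewhere in this file (the reply layout shown here is an
 * assumption for illustration):
 *
 *	struct {
 *		u8 order;
 *		__le64 size;
 *	} __attribute__ ((packed)) size_buf = { 0 };
 *	__le64 snapid = cpu_to_le64(CEPH_NOSNAP);
 *	int ret;
 *
 *	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
 *				  &rbd_dev->header_oloc, "get_size",
 *				  &snapid, sizeof(snapid),
 *				  &size_buf, sizeof(size_buf));
 *
 * On success ret is the number of reply bytes copied into size_buf.
 */
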
Ilya Dryomoved95b212016-08-12 16:40:02 +02003513/*
3514 * lock_rwsem must be held for read
3515 */
3516static void rbd_wait_state_locked(struct rbd_device *rbd_dev)
3517{
3518 DEFINE_WAIT(wait);
3519
3520 do {
3521 /*
3522 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3523 * and cancel_delayed_work() in wake_requests().
3524 */
3525 dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3526 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3527 prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
3528 TASK_UNINTERRUPTIBLE);
3529 up_read(&rbd_dev->lock_rwsem);
3530 schedule();
3531 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003532 } while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED &&
3533 !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags));
3534
Ilya Dryomoved95b212016-08-12 16:40:02 +02003535 finish_wait(&rbd_dev->lock_waitq, &wait);
3536}
3537
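/*
 * The loop above is an open-coded wait: kick lock_dwork, go to sleep
 * on lock_waitq with lock_rwsem dropped, then retake the semaphore
 * and recheck.  It only terminates once the lock is held or this
 * client has been blacklisted.
 */
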
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003538static void rbd_queue_workfn(struct work_struct *work)
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003539{
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003540 struct request *rq = blk_mq_rq_from_pdu(work);
3541 struct rbd_device *rbd_dev = rq->q->queuedata;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003542 struct rbd_img_request *img_request;
Josh Durgin4e752f02014-04-08 11:12:11 -07003543 struct ceph_snap_context *snapc = NULL;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003544 u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
3545 u64 length = blk_rq_bytes(rq);
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08003546 enum obj_operation_type op_type;
Josh Durgin4e752f02014-04-08 11:12:11 -07003547 u64 mapping_size;
Ilya Dryomov80de1912016-09-20 14:23:17 +02003548 bool must_be_locked;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003549 int result;
3550
Christoph Hellwigaebf5262017-01-31 16:57:31 +01003551 switch (req_op(rq)) {
3552 case REQ_OP_DISCARD:
Ilya Dryomov6ac56952017-05-22 19:59:24 +02003553 case REQ_OP_WRITE_ZEROES:
Christoph Hellwigaebf5262017-01-31 16:57:31 +01003554 op_type = OBJ_OP_DISCARD;
3555 break;
3556 case REQ_OP_WRITE:
3557 op_type = OBJ_OP_WRITE;
3558 break;
3559 case REQ_OP_READ:
3560 op_type = OBJ_OP_READ;
3561 break;
3562 default:
3563 dout("%s: non-fs request type %d\n", __func__, req_op(rq));
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003564 result = -EIO;
3565 goto err;
3566 }
3567
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003568 /* Ignore/skip any zero-length requests */
3569
3570 if (!length) {
3571 dout("%s: zero-length request\n", __func__);
3572 result = 0;
3573 goto err_rq;
3574 }
3575
Ilya Dryomov9568c932017-10-12 12:35:19 +02003576 rbd_assert(op_type == OBJ_OP_READ ||
3577 rbd_dev->spec->snap_id == CEPH_NOSNAP);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003578
3579 /*
3580 * Quit early if the mapped snapshot no longer exists. It's
3581 * still possible the snapshot will have disappeared by the
3582 * time our request arrives at the osd, but there's no sense in
3583 * sending it if we already know.
3584 */
3585 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
3586		dout("request for non-existent snapshot\n");
3587 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
3588 result = -ENXIO;
3589 goto err_rq;
3590 }
3591
3592 if (offset && length > U64_MAX - offset + 1) {
3593 rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
3594 length);
3595 result = -EINVAL;
3596 goto err_rq; /* Shouldn't happen */
3597 }
3598
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003599 blk_mq_start_request(rq);
3600
Josh Durgin4e752f02014-04-08 11:12:11 -07003601 down_read(&rbd_dev->header_rwsem);
3602 mapping_size = rbd_dev->mapping.size;
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08003603 if (op_type != OBJ_OP_READ) {
Josh Durgin4e752f02014-04-08 11:12:11 -07003604 snapc = rbd_dev->header.snapc;
3605 ceph_get_snap_context(snapc);
3606 }
3607 up_read(&rbd_dev->header_rwsem);
3608
3609 if (offset + length > mapping_size) {
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003610 rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
Josh Durgin4e752f02014-04-08 11:12:11 -07003611 length, mapping_size);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003612 result = -EIO;
3613 goto err_rq;
3614 }
3615
Ilya Dryomovf9bebd52017-04-13 12:17:39 +02003616 must_be_locked =
3617 (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
3618 (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003619 if (must_be_locked) {
3620 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003621 if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED &&
Ilya Dryomove010dd02017-04-13 12:17:39 +02003622 !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
3623 if (rbd_dev->opts->exclusive) {
3624 rbd_warn(rbd_dev, "exclusive lock required");
3625 result = -EROFS;
3626 goto err_unlock;
3627 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02003628 rbd_wait_state_locked(rbd_dev);
Ilya Dryomove010dd02017-04-13 12:17:39 +02003629 }
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003630 if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
3631 result = -EBLACKLISTED;
3632 goto err_unlock;
3633 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02003634 }
3635
Ilya Dryomovdfd98752018-02-06 19:26:35 +01003636 img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003637 if (!img_request) {
3638 result = -ENOMEM;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003639 goto err_unlock;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003640 }
3641 img_request->rq = rq;
Ilya Dryomov70b16db2015-11-27 19:23:24 +01003642 snapc = NULL; /* img_request consumes a ref */
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003643
Guangliang Zhao90e98c52014-04-01 22:22:16 +08003644 if (op_type == OBJ_OP_DISCARD)
Ilya Dryomov5a237812018-02-06 19:26:34 +01003645 result = rbd_img_fill_nodata(img_request, offset, length);
3646 else
3647 result = rbd_img_fill_from_bio(img_request, offset, length,
3648 rq->bio);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003649 if (result)
3650 goto err_img_request;
3651
Ilya Dryomovefbd1a12018-01-30 17:52:11 +01003652 rbd_img_request_submit(img_request);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003653 if (must_be_locked)
3654 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003655 return;
3656
3657err_img_request:
3658 rbd_img_request_put(img_request);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003659err_unlock:
3660 if (must_be_locked)
3661 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003662err_rq:
3663 if (result)
3664 rbd_warn(rbd_dev, "%s %llx at %llx result %d",
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08003665 obj_op_name(op_type), length, offset, result);
SF Markus Elfringe96a6502014-11-02 15:20:59 +01003666 ceph_put_snap_context(snapc);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003667err:
Christoph Hellwig2a842ac2017-06-03 09:38:04 +02003668 blk_mq_end_request(rq, errno_to_blk_status(result));
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003669}
3670
Christoph Hellwigfc17b652017-06-03 09:38:05 +02003671static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003672 const struct blk_mq_queue_data *bd)
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003673{
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003674 struct request *rq = bd->rq;
3675 struct work_struct *work = blk_mq_rq_to_pdu(rq);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003676
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003677 queue_work(rbd_wq, work);
Christoph Hellwigfc17b652017-06-03 09:38:05 +02003678 return BLK_STS_OK;
Alex Elderbf0d5f502012-11-22 00:00:08 -06003679}
3680
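/*
 * Note the blk-mq pattern here: a work_struct is embedded in each
 * request's driver payload (tag_set.cmd_size below) and initialized
 * in rbd_init_request(), so rbd_queue_rq() only has to punt the
 * request to rbd_wq, where rbd_queue_workfn() runs it in process
 * context.
 */
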
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003681static void rbd_free_disk(struct rbd_device *rbd_dev)
3682{
Ilya Dryomov5769ed02017-04-13 12:17:38 +02003683 blk_cleanup_queue(rbd_dev->disk->queue);
3684 blk_mq_free_tag_set(&rbd_dev->tag_set);
3685 put_disk(rbd_dev->disk);
Alex Eldera0cab922013-04-25 23:15:08 -05003686 rbd_dev->disk = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003687}
3688
Alex Elder788e2df2013-01-17 12:25:27 -06003689static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003690 struct ceph_object_id *oid,
3691 struct ceph_object_locator *oloc,
3692 void *buf, int buf_len)
Alex Elder788e2df2013-01-17 12:25:27 -06003693
3694{
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003695 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3696 struct ceph_osd_request *req;
3697 struct page **pages;
3698 int num_pages = calc_pages_for(0, buf_len);
Alex Elder788e2df2013-01-17 12:25:27 -06003699 int ret;
3700
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003701 req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
3702 if (!req)
3703 return -ENOMEM;
Alex Elder788e2df2013-01-17 12:25:27 -06003704
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003705 ceph_oid_copy(&req->r_base_oid, oid);
3706 ceph_oloc_copy(&req->r_base_oloc, oloc);
3707 req->r_flags = CEPH_OSD_FLAG_READ;
Alex Elder788e2df2013-01-17 12:25:27 -06003708
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003709 ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
Alex Elder788e2df2013-01-17 12:25:27 -06003710 if (ret)
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003711 goto out_req;
Alex Elder788e2df2013-01-17 12:25:27 -06003712
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003713 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
3714 if (IS_ERR(pages)) {
3715 ret = PTR_ERR(pages);
3716 goto out_req;
3717 }
Alex Elder1ceae7e2013-02-06 13:11:38 -06003718
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003719 osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
3720 osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
3721 true);
Alex Elder788e2df2013-01-17 12:25:27 -06003722
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003723 ceph_osdc_start_request(osdc, req, false);
3724 ret = ceph_osdc_wait_request(osdc, req);
3725 if (ret >= 0)
3726 ceph_copy_from_page_vector(pages, buf, 0, ret);
3727
3728out_req:
3729 ceph_osdc_put_request(req);
Alex Elder788e2df2013-01-17 12:25:27 -06003730 return ret;
3731}
3732
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003733/*
Alex Elder662518b2013-05-06 09:51:29 -05003734 * Read the complete header for the given rbd device. On successful
3735 * return, the rbd_dev->header field will contain up-to-date
3736 * information about the image.
Alex Elder4156d992012-08-02 11:29:46 -05003737 */
Alex Elder99a41eb2013-05-06 09:51:30 -05003738static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
Alex Elder4156d992012-08-02 11:29:46 -05003739{
3740 struct rbd_image_header_ondisk *ondisk = NULL;
3741 u32 snap_count = 0;
3742 u64 names_size = 0;
3743 u32 want_count;
3744 int ret;
3745
3746 /*
3747 * The complete header will include an array of its 64-bit
3748 * snapshot ids, followed by the names of those snapshots as
3749 * a contiguous block of NUL-terminated strings. Note that
3750 * the number of snapshots could change by the time we read
3751 * it in, in which case we re-read it.
3752 */
3753 do {
3754 size_t size;
3755
3756 kfree(ondisk);
3757
3758 size = sizeof (*ondisk);
3759 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
3760 size += names_size;
3761 ondisk = kmalloc(size, GFP_KERNEL);
3762 if (!ondisk)
Alex Elder662518b2013-05-06 09:51:29 -05003763 return -ENOMEM;
Alex Elder4156d992012-08-02 11:29:46 -05003764
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003765 ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
3766 &rbd_dev->header_oloc, ondisk, size);
Alex Elder4156d992012-08-02 11:29:46 -05003767 if (ret < 0)
Alex Elder662518b2013-05-06 09:51:29 -05003768 goto out;
Alex Elderc0cd10db2013-04-26 09:43:47 -05003769 if ((size_t)ret < size) {
Alex Elder4156d992012-08-02 11:29:46 -05003770 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05003771 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
3772 size, ret);
Alex Elder662518b2013-05-06 09:51:29 -05003773 goto out;
Alex Elder4156d992012-08-02 11:29:46 -05003774 }
3775 if (!rbd_dev_ondisk_valid(ondisk)) {
3776 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05003777 rbd_warn(rbd_dev, "invalid header");
Alex Elder662518b2013-05-06 09:51:29 -05003778 goto out;
Alex Elder4156d992012-08-02 11:29:46 -05003779 }
3780
3781 names_size = le64_to_cpu(ondisk->snap_names_len);
3782 want_count = snap_count;
3783 snap_count = le32_to_cpu(ondisk->snap_count);
3784 } while (snap_count != want_count);
3785
Alex Elder662518b2013-05-06 09:51:29 -05003786 ret = rbd_header_from_disk(rbd_dev, ondisk);
3787out:
Alex Elder4156d992012-08-02 11:29:46 -05003788 kfree(ondisk);
3789
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003790 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003791}
3792
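/*
 * On-disk layout read above, as implied by the sizing math (a sketch,
 * not a format spec):
 *
 *	struct rbd_image_header_ondisk		fixed-size header
 *	struct rbd_image_snap_ondisk[snap_count]
 *	names_size bytes of NUL-terminated snapshot names
 *
 * snap_count and names_size come from the header itself, hence the
 * re-read loop when a concurrent snapshot create/delete changes them.
 */
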
Alex Elder15228ed2013-05-01 12:43:03 -05003793/*
3794 * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
3795 * has disappeared from the (just updated) snapshot context.
3796 */
3797static void rbd_exists_validate(struct rbd_device *rbd_dev)
3798{
3799 u64 snap_id;
3800
3801 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
3802 return;
3803
3804 snap_id = rbd_dev->spec->snap_id;
3805 if (snap_id == CEPH_NOSNAP)
3806 return;
3807
3808 if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
3809 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
3810}
3811
Josh Durgin98752012013-08-29 17:26:31 -07003812static void rbd_dev_update_size(struct rbd_device *rbd_dev)
3813{
3814 sector_t size;
Josh Durgin98752012013-08-29 17:26:31 -07003815
3816 /*
Ilya Dryomov811c6682016-04-15 16:22:16 +02003817 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
3818 * try to update its size. If REMOVING is set, updating size
3819 * is just useless work since the device can't be opened.
Josh Durgin98752012013-08-29 17:26:31 -07003820 */
Ilya Dryomov811c6682016-04-15 16:22:16 +02003821 if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
3822 !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
Josh Durgin98752012013-08-29 17:26:31 -07003823 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
3824 dout("setting size to %llu sectors", (unsigned long long)size);
3825 set_capacity(rbd_dev->disk, size);
3826 revalidate_disk(rbd_dev->disk);
3827 }
3828}
3829
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003830static int rbd_dev_refresh(struct rbd_device *rbd_dev)
Alex Elder1fe5e992012-07-25 09:32:41 -05003831{
Alex Eldere627db02013-05-06 07:40:30 -05003832 u64 mapping_size;
Alex Elder1fe5e992012-07-25 09:32:41 -05003833 int ret;
3834
Alex Eldercfbf6372013-05-31 17:40:45 -05003835 down_write(&rbd_dev->header_rwsem);
Alex Elder3b5cf2a2013-05-29 11:18:59 -05003836 mapping_size = rbd_dev->mapping.size;
Ilya Dryomova720ae02014-07-23 17:11:19 +04003837
3838 ret = rbd_dev_header_info(rbd_dev);
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04003839 if (ret)
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003840 goto out;
Alex Elder15228ed2013-05-01 12:43:03 -05003841
Ilya Dryomove8f59b52014-07-24 10:42:13 +04003842 /*
3843 * If there is a parent, see if it has disappeared due to the
3844 * mapped image getting flattened.
3845 */
3846 if (rbd_dev->parent) {
3847 ret = rbd_dev_v2_parent_info(rbd_dev);
3848 if (ret)
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003849 goto out;
Ilya Dryomove8f59b52014-07-24 10:42:13 +04003850 }
3851
Ilya Dryomov5ff11082014-07-23 17:11:21 +04003852 if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003853 rbd_dev->mapping.size = rbd_dev->header.image_size;
Ilya Dryomov5ff11082014-07-23 17:11:21 +04003854 } else {
3855 /* validate mapped snapshot's EXISTS flag */
3856 rbd_exists_validate(rbd_dev);
3857 }
Alex Elder15228ed2013-05-01 12:43:03 -05003858
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003859out:
Alex Eldercfbf6372013-05-31 17:40:45 -05003860 up_write(&rbd_dev->header_rwsem);
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003861 if (!ret && mapping_size != rbd_dev->mapping.size)
Josh Durgin98752012013-08-29 17:26:31 -07003862 rbd_dev_update_size(rbd_dev);
Alex Elder1fe5e992012-07-25 09:32:41 -05003863
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003864 return ret;
Alex Elder1fe5e992012-07-25 09:32:41 -05003865}
3866
Christoph Hellwigd6296d32017-05-01 10:19:08 -06003867static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
3868 unsigned int hctx_idx, unsigned int numa_node)
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003869{
3870 struct work_struct *work = blk_mq_rq_to_pdu(rq);
3871
3872 INIT_WORK(work, rbd_queue_workfn);
3873 return 0;
3874}
3875
Eric Biggersf363b082017-03-30 13:39:16 -07003876static const struct blk_mq_ops rbd_mq_ops = {
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003877 .queue_rq = rbd_queue_rq,
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003878 .init_request = rbd_init_request,
3879};
3880
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003881static int rbd_init_disk(struct rbd_device *rbd_dev)
3882{
3883 struct gendisk *disk;
3884 struct request_queue *q;
Alex Elder593a9e72012-02-07 12:03:37 -06003885 u64 segment_size;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003886 int err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003887
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003888 /* create gendisk info */
Ilya Dryomov7e513d42013-12-16 19:26:32 +02003889 disk = alloc_disk(single_major ?
3890 (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
3891 RBD_MINORS_PER_MAJOR);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003892 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05003893 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003894
Alex Elderf0f8cef2012-01-29 13:57:44 -06003895 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05003896 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003897 disk->major = rbd_dev->major;
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02003898 disk->first_minor = rbd_dev->minor;
Ilya Dryomov7e513d42013-12-16 19:26:32 +02003899 if (single_major)
3900 disk->flags |= GENHD_FL_EXT_DEVT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003901 disk->fops = &rbd_bd_ops;
3902 disk->private_data = rbd_dev;
3903
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003904 memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
3905 rbd_dev->tag_set.ops = &rbd_mq_ops;
Ilya Dryomovb5584182015-06-23 16:21:19 +03003906 rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003907 rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
Ilya Dryomovb5584182015-06-23 16:21:19 +03003908 rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003909 rbd_dev->tag_set.nr_hw_queues = 1;
3910 rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
3911
3912 err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
3913 if (err)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003914 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07003915
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003916 q = blk_mq_init_queue(&rbd_dev->tag_set);
3917 if (IS_ERR(q)) {
3918 err = PTR_ERR(q);
3919 goto out_tag_set;
3920 }
3921
Ilya Dryomovd8a2c892015-03-24 16:15:17 +03003922 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
3923 /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
Alex Elder593a9e72012-02-07 12:03:37 -06003924
Josh Durgin029bcbd2011-07-22 11:35:23 -07003925 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06003926 segment_size = rbd_obj_bytes(&rbd_dev->header);
3927 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
Ilya Dryomov0d9fde42015-10-07 16:09:35 +02003928 q->limits.max_sectors = queue_max_hw_sectors(q);
Ilya Dryomov21acdf42017-12-21 15:35:11 +01003929 blk_queue_max_segments(q, USHRT_MAX);
Ilya Dryomov24f1df62018-01-12 17:22:10 +01003930 blk_queue_max_segment_size(q, UINT_MAX);
Alex Elder593a9e72012-02-07 12:03:37 -06003931 blk_queue_io_min(q, segment_size);
3932 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07003933
Guangliang Zhao90e98c52014-04-01 22:22:16 +08003934	/* enable discard support */
3935 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
3936 q->limits.discard_granularity = segment_size;
Jens Axboe2bb4cd52015-07-14 08:15:12 -06003937 blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE);
Ilya Dryomov6ac56952017-05-22 19:59:24 +02003938 blk_queue_max_write_zeroes_sectors(q, segment_size / SECTOR_SIZE);
Guangliang Zhao90e98c52014-04-01 22:22:16 +08003939
Ronny Hegewaldbae818e2015-10-15 18:50:46 +00003940 if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
Jan Karadc3b17c2017-02-02 15:56:50 +01003941 q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
Ronny Hegewaldbae818e2015-10-15 18:50:46 +00003942
Ilya Dryomov5769ed02017-04-13 12:17:38 +02003943 /*
3944 * disk_release() expects a queue ref from add_disk() and will
3945 * put it. Hold an extra ref until add_disk() is called.
3946 */
3947 WARN_ON(!blk_get_queue(q));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003948 disk->queue = q;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003949 q->queuedata = rbd_dev;
3950
3951 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003952
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003953 return 0;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003954out_tag_set:
3955 blk_mq_free_tag_set(&rbd_dev->tag_set);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003956out_disk:
3957 put_disk(disk);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003958 return err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003959}
3960
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003961/*
3962 sysfs
3963*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003964
Alex Elder593a9e72012-02-07 12:03:37 -06003965static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3966{
3967 return container_of(dev, struct rbd_device, dev);
3968}
3969
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003970static ssize_t rbd_size_show(struct device *dev,
3971 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003972{
Alex Elder593a9e72012-02-07 12:03:37 -06003973 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003974
Alex Elderfc71d832013-04-26 15:44:36 -05003975 return sprintf(buf, "%llu\n",
3976 (unsigned long long)rbd_dev->mapping.size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003977}
3978
Alex Elder34b13182012-07-13 20:35:12 -05003979/*
3980 * Note this shows the features for whatever's mapped, which is not
3981 * necessarily the base image.
3982 */
3983static ssize_t rbd_features_show(struct device *dev,
3984 struct device_attribute *attr, char *buf)
3985{
3986 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3987
3988 return sprintf(buf, "0x%016llx\n",
Alex Elderfc71d832013-04-26 15:44:36 -05003989 (unsigned long long)rbd_dev->mapping.features);
Alex Elder34b13182012-07-13 20:35:12 -05003990}
3991
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003992static ssize_t rbd_major_show(struct device *dev,
3993 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003994{
Alex Elder593a9e72012-02-07 12:03:37 -06003995 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003996
Alex Elderfc71d832013-04-26 15:44:36 -05003997 if (rbd_dev->major)
3998 return sprintf(buf, "%d\n", rbd_dev->major);
3999
4000 return sprintf(buf, "(none)\n");
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004001}
Alex Elderfc71d832013-04-26 15:44:36 -05004002
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004003static ssize_t rbd_minor_show(struct device *dev,
4004 struct device_attribute *attr, char *buf)
4005{
4006 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4007
4008 return sprintf(buf, "%d\n", rbd_dev->minor);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004009}
4010
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02004011static ssize_t rbd_client_addr_show(struct device *dev,
4012 struct device_attribute *attr, char *buf)
4013{
4014 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4015 struct ceph_entity_addr *client_addr =
4016 ceph_client_addr(rbd_dev->rbd_client->client);
4017
4018 return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
4019 le32_to_cpu(client_addr->nonce));
4020}
4021
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004022static ssize_t rbd_client_id_show(struct device *dev,
4023 struct device_attribute *attr, char *buf)
4024{
Alex Elder593a9e72012-02-07 12:03:37 -06004025 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004026
Alex Elder1dbb4392012-01-24 10:08:37 -06004027 return sprintf(buf, "client%lld\n",
Ilya Dryomov033268a2016-08-12 14:59:58 +02004028 ceph_client_gid(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004029}
4030
Mike Christie267fb902016-08-18 18:38:43 +02004031static ssize_t rbd_cluster_fsid_show(struct device *dev,
4032 struct device_attribute *attr, char *buf)
4033{
4034 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4035
4036 return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
4037}
4038
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004039static ssize_t rbd_config_info_show(struct device *dev,
4040 struct device_attribute *attr, char *buf)
4041{
4042 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4043
4044 return sprintf(buf, "%s\n", rbd_dev->config_info);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004045}
4046
4047static ssize_t rbd_pool_show(struct device *dev,
4048 struct device_attribute *attr, char *buf)
4049{
Alex Elder593a9e72012-02-07 12:03:37 -06004050 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004051
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004052 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004053}
4054
Alex Elder9bb2f332012-07-12 10:46:35 -05004055static ssize_t rbd_pool_id_show(struct device *dev,
4056 struct device_attribute *attr, char *buf)
4057{
4058 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4059
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004060 return sprintf(buf, "%llu\n",
Alex Elderfc71d832013-04-26 15:44:36 -05004061 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05004062}
4063
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004064static ssize_t rbd_name_show(struct device *dev,
4065 struct device_attribute *attr, char *buf)
4066{
Alex Elder593a9e72012-02-07 12:03:37 -06004067 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004068
Alex Eldera92ffdf2012-10-30 19:40:33 -05004069 if (rbd_dev->spec->image_name)
4070 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
4071
4072 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004073}
4074
Alex Elder589d30e2012-07-10 20:30:11 -05004075static ssize_t rbd_image_id_show(struct device *dev,
4076 struct device_attribute *attr, char *buf)
4077{
4078 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4079
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004080 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05004081}
4082
Alex Elder34b13182012-07-13 20:35:12 -05004083/*
4084 * Shows the name of the currently-mapped snapshot (or
4085 * RBD_SNAP_HEAD_NAME for the base image).
4086 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004087static ssize_t rbd_snap_show(struct device *dev,
4088 struct device_attribute *attr,
4089 char *buf)
4090{
Alex Elder593a9e72012-02-07 12:03:37 -06004091 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004092
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004093 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004094}
4095
Mike Christie92a58672016-08-18 18:38:44 +02004096static ssize_t rbd_snap_id_show(struct device *dev,
4097 struct device_attribute *attr, char *buf)
4098{
4099 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4100
4101 return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
4102}
4103
Alex Elder86b00e02012-10-25 23:34:42 -05004104/*
Ilya Dryomovff961282014-07-22 21:53:07 +04004105 * For a v2 image, shows the chain of parent images, separated by empty
4106 * lines. For v1 images or if there is no parent, shows "(no parent
4107 * image)".
Alex Elder86b00e02012-10-25 23:34:42 -05004108 */
4109static ssize_t rbd_parent_show(struct device *dev,
Ilya Dryomovff961282014-07-22 21:53:07 +04004110 struct device_attribute *attr,
4111 char *buf)
Alex Elder86b00e02012-10-25 23:34:42 -05004112{
4113 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Ilya Dryomovff961282014-07-22 21:53:07 +04004114 ssize_t count = 0;
Alex Elder86b00e02012-10-25 23:34:42 -05004115
Ilya Dryomovff961282014-07-22 21:53:07 +04004116 if (!rbd_dev->parent)
Alex Elder86b00e02012-10-25 23:34:42 -05004117 return sprintf(buf, "(no parent image)\n");
4118
Ilya Dryomovff961282014-07-22 21:53:07 +04004119 for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
4120 struct rbd_spec *spec = rbd_dev->parent_spec;
Alex Elder86b00e02012-10-25 23:34:42 -05004121
Ilya Dryomovff961282014-07-22 21:53:07 +04004122 count += sprintf(&buf[count], "%s"
4123 "pool_id %llu\npool_name %s\n"
4124 "image_id %s\nimage_name %s\n"
4125 "snap_id %llu\nsnap_name %s\n"
4126 "overlap %llu\n",
4127 !count ? "" : "\n", /* first? */
4128 spec->pool_id, spec->pool_name,
4129 spec->image_id, spec->image_name ?: "(unknown)",
4130 spec->snap_id, spec->snap_name,
4131 rbd_dev->parent_overlap);
4132 }
Alex Elder86b00e02012-10-25 23:34:42 -05004133
Ilya Dryomovff961282014-07-22 21:53:07 +04004134 return count;
Alex Elder86b00e02012-10-25 23:34:42 -05004135}
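/*
 * Illustrative output (all values made up): reading the "parent"
 * attribute of a clone whose parent is itself a clone would yield
 * two of these records, separated by an empty line:
 *
 *   pool_id 1
 *   pool_name rbd
 *   image_id 10076b8b4567
 *   image_name base
 *   snap_id 4
 *   snap_name snap1
 *   overlap 1073741824
 */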
4136
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004137static ssize_t rbd_image_refresh(struct device *dev,
4138 struct device_attribute *attr,
4139 const char *buf,
4140 size_t size)
4141{
Alex Elder593a9e72012-02-07 12:03:37 -06004142 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05004143 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004144
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004145 ret = rbd_dev_refresh(rbd_dev);
Alex Eldere627db02013-05-06 07:40:30 -05004146 if (ret)
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04004147 return ret;
Alex Elderb8136232012-07-25 09:32:41 -05004148
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04004149 return size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004150}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004151
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004152static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
Alex Elder34b13182012-07-13 20:35:12 -05004153static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004154static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004155static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02004156static DEVICE_ATTR(client_addr, S_IRUGO, rbd_client_addr_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004157static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
Mike Christie267fb902016-08-18 18:38:43 +02004158static DEVICE_ATTR(cluster_fsid, S_IRUGO, rbd_cluster_fsid_show, NULL);
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004159static DEVICE_ATTR(config_info, S_IRUSR, rbd_config_info_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004160static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
Alex Elder9bb2f332012-07-12 10:46:35 -05004161static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004162static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
Alex Elder589d30e2012-07-10 20:30:11 -05004163static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004164static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
4165static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
Mike Christie92a58672016-08-18 18:38:44 +02004166static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
Alex Elder86b00e02012-10-25 23:34:42 -05004167static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004168
4169static struct attribute *rbd_attrs[] = {
4170 &dev_attr_size.attr,
Alex Elder34b13182012-07-13 20:35:12 -05004171 &dev_attr_features.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004172 &dev_attr_major.attr,
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004173 &dev_attr_minor.attr,
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02004174 &dev_attr_client_addr.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004175 &dev_attr_client_id.attr,
Mike Christie267fb902016-08-18 18:38:43 +02004176 &dev_attr_cluster_fsid.attr,
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004177 &dev_attr_config_info.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004178 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05004179 &dev_attr_pool_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004180 &dev_attr_name.attr,
Alex Elder589d30e2012-07-10 20:30:11 -05004181 &dev_attr_image_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004182 &dev_attr_current_snap.attr,
Mike Christie92a58672016-08-18 18:38:44 +02004183 &dev_attr_snap_id.attr,
Alex Elder86b00e02012-10-25 23:34:42 -05004184 &dev_attr_parent.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004185 &dev_attr_refresh.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004186 NULL
4187};
4188
4189static struct attribute_group rbd_attr_group = {
4190 .attrs = rbd_attrs,
4191};
4192
4193static const struct attribute_group *rbd_attr_groups[] = {
4194 &rbd_attr_group,
4195 NULL
4196};
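/*
 * These attributes are exposed under /sys/bus/rbd/devices/<dev_id>/;
 * for example, for a hypothetical device 0:
 *
 *   cat /sys/bus/rbd/devices/0/pool
 *
 * prints the pool name followed by a newline.
 */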
4197
Ilya Dryomov6cac4692015-10-16 20:11:25 +02004198static void rbd_dev_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004199
Bhumika Goyalb9942bc2017-02-11 12:14:38 +05304200static const struct device_type rbd_device_type = {
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004201 .name = "rbd",
4202 .groups = rbd_attr_groups,
Ilya Dryomov6cac4692015-10-16 20:11:25 +02004203 .release = rbd_dev_release,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004204};
4205
Alex Elder8b8fb992012-10-26 17:25:24 -05004206static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
4207{
4208 kref_get(&spec->kref);
4209
4210 return spec;
4211}
4212
4213static void rbd_spec_free(struct kref *kref);
4214static void rbd_spec_put(struct rbd_spec *spec)
4215{
4216 if (spec)
4217 kref_put(&spec->kref, rbd_spec_free);
4218}
4219
4220static struct rbd_spec *rbd_spec_alloc(void)
4221{
4222 struct rbd_spec *spec;
4223
4224 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
4225 if (!spec)
4226 return NULL;
Ilya Dryomov04077592014-07-23 17:11:20 +04004227
4228 spec->pool_id = CEPH_NOPOOL;
4229 spec->snap_id = CEPH_NOSNAP;
Alex Elder8b8fb992012-10-26 17:25:24 -05004230 kref_init(&spec->kref);
4231
Alex Elder8b8fb992012-10-26 17:25:24 -05004232 return spec;
4233}
4234
4235static void rbd_spec_free(struct kref *kref)
4236{
4237 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
4238
4239 kfree(spec->pool_name);
4240 kfree(spec->image_id);
4241 kfree(spec->image_name);
4242 kfree(spec->snap_name);
4243 kfree(spec);
4244}
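/*
 * Illustrative sketch (not part of the driver): the intended
 * rbd_spec lifetime. rbd_spec_alloc() returns a spec holding one
 * reference; every additional holder takes its own reference with
 * rbd_spec_get() and drops it with rbd_spec_put(). The final put
 * invokes rbd_spec_free(), which also frees the dynamically
 * allocated name strings.
 */
static void example_rbd_spec_lifetime(void)
{
	struct rbd_spec *spec = rbd_spec_alloc();	/* kref == 1 */
	struct rbd_spec *extra;

	if (!spec)
		return;

	extra = rbd_spec_get(spec);	/* kref == 2, same object */
	rbd_spec_put(extra);		/* kref == 1 */
	rbd_spec_put(spec);		/* kref == 0 -> rbd_spec_free() */
}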
4245
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004246static void rbd_dev_free(struct rbd_device *rbd_dev)
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004247{
Ilya Dryomov99d16942016-08-12 16:11:41 +02004248 WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004249 WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004250
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004251 ceph_oid_destroy(&rbd_dev->header_oid);
Ilya Dryomov6b6dddb2016-08-05 16:15:38 +02004252 ceph_oloc_destroy(&rbd_dev->header_oloc);
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004253 kfree(rbd_dev->config_info);
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004254
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004255 rbd_put_client(rbd_dev->rbd_client);
4256 rbd_spec_put(rbd_dev->spec);
4257 kfree(rbd_dev->opts);
4258 kfree(rbd_dev);
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004259}
4260
4261static void rbd_dev_release(struct device *dev)
4262{
4263 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4264 bool need_put = !!rbd_dev->opts;
4265
4266 if (need_put) {
4267 destroy_workqueue(rbd_dev->task_wq);
4268 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4269 }
4270
4271 rbd_dev_free(rbd_dev);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004272
4273 /*
4274 * This is racy, but way better than putting the module outside of
4275 * the release callback. The race window is pretty small, so
4276 * doing something similar to dm (dm-builtin.c) is overkill.
4277 */
4278 if (need_put)
4279 module_put(THIS_MODULE);
4280}
4281
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004282static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
4283 struct rbd_spec *spec)
Alex Elderc53d5892012-10-25 23:34:42 -05004284{
4285 struct rbd_device *rbd_dev;
4286
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004287 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
Alex Elderc53d5892012-10-25 23:34:42 -05004288 if (!rbd_dev)
4289 return NULL;
4290
4291 spin_lock_init(&rbd_dev->lock);
4292 INIT_LIST_HEAD(&rbd_dev->node);
Alex Elderc53d5892012-10-25 23:34:42 -05004293 init_rwsem(&rbd_dev->header_rwsem);
4294
Ilya Dryomov7e973322017-01-25 18:16:22 +01004295 rbd_dev->header.data_pool_id = CEPH_NOPOOL;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004296 ceph_oid_init(&rbd_dev->header_oid);
Ilya Dryomov431a02c2017-01-25 18:16:21 +01004297 rbd_dev->header_oloc.pool = spec->pool_id;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004298
Ilya Dryomov99d16942016-08-12 16:11:41 +02004299 mutex_init(&rbd_dev->watch_mutex);
4300 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
4301 INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
4302
Ilya Dryomoved95b212016-08-12 16:40:02 +02004303 init_rwsem(&rbd_dev->lock_rwsem);
4304 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
4305 INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
4306 INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
4307 INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
4308 INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
4309 init_waitqueue_head(&rbd_dev->lock_waitq);
4310
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004311 rbd_dev->dev.bus = &rbd_bus_type;
4312 rbd_dev->dev.type = &rbd_device_type;
4313 rbd_dev->dev.parent = &rbd_root_dev;
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004314 device_initialize(&rbd_dev->dev);
4315
Alex Elderc53d5892012-10-25 23:34:42 -05004316 rbd_dev->rbd_client = rbdc;
Ilya Dryomovd1475432015-06-22 13:24:48 +03004317 rbd_dev->spec = spec;
Alex Elder0903e872012-11-14 12:25:19 -06004318
Alex Elderc53d5892012-10-25 23:34:42 -05004319 return rbd_dev;
4320}
4321
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004322/*
4323 * Create an rbd_dev representing a mapping (a user-mapped image).
4324 */
4325static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
4326 struct rbd_spec *spec,
4327 struct rbd_options *opts)
4328{
4329 struct rbd_device *rbd_dev;
4330
4331 rbd_dev = __rbd_dev_create(rbdc, spec);
4332 if (!rbd_dev)
4333 return NULL;
4334
4335 rbd_dev->opts = opts;
4336
4337 /* get an id and fill in device name */
4338 rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
4339 minor_to_rbd_dev_id(1 << MINORBITS),
4340 GFP_KERNEL);
4341 if (rbd_dev->dev_id < 0)
4342 goto fail_rbd_dev;
4343
4344 sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
4345 rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
4346 rbd_dev->name);
4347 if (!rbd_dev->task_wq)
4348 goto fail_dev_id;
4349
4350 /* we have a ref from do_rbd_add() */
4351 __module_get(THIS_MODULE);
4352
4353 dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
4354 return rbd_dev;
4355
4356fail_dev_id:
4357 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4358fail_rbd_dev:
4359 rbd_dev_free(rbd_dev);
4360 return NULL;
4361}
4362
Alex Elderc53d5892012-10-25 23:34:42 -05004363static void rbd_dev_destroy(struct rbd_device *rbd_dev)
4364{
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004365 if (rbd_dev)
4366 put_device(&rbd_dev->dev);
Alex Elderc53d5892012-10-25 23:34:42 -05004367}
4368
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004369/*
Alex Elder9d475de2012-07-03 16:01:19 -05004370 * Get the size and object order for an image snapshot, or if
4371 * snap_id is CEPH_NOSNAP, get this information for the base
4372 * image.
4373 */
4374static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
4375 u8 *order, u64 *snap_size)
4376{
4377 __le64 snapid = cpu_to_le64(snap_id);
4378 int ret;
4379 struct {
4380 u8 order;
4381 __le64 size;
4382 } __attribute__ ((packed)) size_buf = { 0 };
4383
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004384 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4385 &rbd_dev->header_oloc, "get_size",
4386 &snapid, sizeof(snapid),
4387 &size_buf, sizeof(size_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06004388 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder9d475de2012-07-03 16:01:19 -05004389 if (ret < 0)
4390 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05004391 if (ret < sizeof (size_buf))
4392 return -ERANGE;
Alex Elder9d475de2012-07-03 16:01:19 -05004393
Josh Durginc3545572013-08-28 17:08:10 -07004394 if (order) {
Alex Elderc86f86e2013-04-25 15:09:41 -05004395 *order = size_buf.order;
Josh Durginc3545572013-08-28 17:08:10 -07004396 dout(" order %u", (unsigned int)*order);
4397 }
Alex Elder9d475de2012-07-03 16:01:19 -05004398 *snap_size = le64_to_cpu(size_buf.size);
4399
Josh Durginc3545572013-08-28 17:08:10 -07004400 dout(" snap_id 0x%016llx snap_size = %llu\n",
4401 (unsigned long long)snap_id,
Alex Elder57385b52013-04-21 12:14:45 -05004402 (unsigned long long)*snap_size);
Alex Elder9d475de2012-07-03 16:01:19 -05004403
4404 return 0;
4405}
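/*
 * Illustrative sketch (not part of the driver): the idiom used
 * throughout this file for calling an rbd class method and decoding
 * a small fixed-size, packed, little-endian reply. The "get_flags"
 * method name and its reply layout are assumptions made for this
 * example only.
 */
static int example_get_flags(struct rbd_device *rbd_dev, u64 snap_id,
			     u64 *flags)
{
	__le64 snapid = cpu_to_le64(snap_id);
	__le64 flags_buf = 0;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_flags",
				  &snapid, sizeof(snapid),
				  &flags_buf, sizeof(flags_buf));
	if (ret < 0)
		return ret;		/* RPC or class method error */
	if (ret < sizeof(flags_buf))
		return -ERANGE;		/* short reply */

	*flags = le64_to_cpu(flags_buf);
	return 0;
}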
4406
4407static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
4408{
4409 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
4410 &rbd_dev->header.obj_order,
4411 &rbd_dev->header.image_size);
4412}
4413
Alex Elder1e130192012-07-03 16:01:19 -05004414static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
4415{
4416 void *reply_buf;
4417 int ret;
4418 void *p;
4419
4420 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
4421 if (!reply_buf)
4422 return -ENOMEM;
4423
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004424 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4425 &rbd_dev->header_oloc, "get_object_prefix",
4426 NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
Alex Elder36be9a72013-01-19 00:30:28 -06004427 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder1e130192012-07-03 16:01:19 -05004428 if (ret < 0)
4429 goto out;
4430
4431 p = reply_buf;
4432 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
Alex Elder57385b52013-04-21 12:14:45 -05004433 p + ret, NULL, GFP_NOIO);
4434 ret = 0;
Alex Elder1e130192012-07-03 16:01:19 -05004435
4436 if (IS_ERR(rbd_dev->header.object_prefix)) {
4437 ret = PTR_ERR(rbd_dev->header.object_prefix);
4438 rbd_dev->header.object_prefix = NULL;
4439 } else {
4440 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
4441 }
Alex Elder1e130192012-07-03 16:01:19 -05004442out:
4443 kfree(reply_buf);
4444
4445 return ret;
4446}
4447
Alex Elderb1b54022012-07-03 16:01:19 -05004448static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
4449 u64 *snap_features)
4450{
4451 __le64 snapid = cpu_to_le64(snap_id);
4452 struct {
4453 __le64 features;
4454 __le64 incompat;
Alex Elder41579762013-04-21 12:14:45 -05004455 } __attribute__ ((packed)) features_buf = { 0 };
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004456 u64 unsup;
Alex Elderb1b54022012-07-03 16:01:19 -05004457 int ret;
4458
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004459 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4460 &rbd_dev->header_oloc, "get_features",
4461 &snapid, sizeof(snapid),
4462 &features_buf, sizeof(features_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06004463 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderb1b54022012-07-03 16:01:19 -05004464 if (ret < 0)
4465 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05004466 if (ret < sizeof (features_buf))
4467 return -ERANGE;
Alex Elderd8891402012-10-09 13:50:17 -07004468
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004469 unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
4470 if (unsup) {
4471 rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
4472 unsup);
Alex Elderb8f5c6e2012-11-01 08:39:26 -05004473 return -ENXIO;
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004474 }
Alex Elderd8891402012-10-09 13:50:17 -07004475
Alex Elderb1b54022012-07-03 16:01:19 -05004476 *snap_features = le64_to_cpu(features_buf.features);
4477
4478 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
Alex Elder57385b52013-04-21 12:14:45 -05004479 (unsigned long long)snap_id,
4480 (unsigned long long)*snap_features,
4481 (unsigned long long)le64_to_cpu(features_buf.incompat));
Alex Elderb1b54022012-07-03 16:01:19 -05004482
4483 return 0;
4484}
4485
4486static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
4487{
4488 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
4489 &rbd_dev->header.features);
4490}
4491
Alex Elder86b00e02012-10-25 23:34:42 -05004492static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
4493{
4494 struct rbd_spec *parent_spec;
4495 size_t size;
4496 void *reply_buf = NULL;
4497 __le64 snapid;
4498 void *p;
4499 void *end;
Alex Elder642a2532013-05-06 17:40:33 -05004500 u64 pool_id;
Alex Elder86b00e02012-10-25 23:34:42 -05004501 char *image_id;
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004502 u64 snap_id;
Alex Elder86b00e02012-10-25 23:34:42 -05004503 u64 overlap;
Alex Elder86b00e02012-10-25 23:34:42 -05004504 int ret;
4505
4506 parent_spec = rbd_spec_alloc();
4507 if (!parent_spec)
4508 return -ENOMEM;
4509
4510 size = sizeof (__le64) + /* pool_id */
4511 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
4512 sizeof (__le64) + /* snap_id */
4513 sizeof (__le64); /* overlap */
4514 reply_buf = kmalloc(size, GFP_KERNEL);
4515 if (!reply_buf) {
4516 ret = -ENOMEM;
4517 goto out_err;
4518 }
4519
Ilya Dryomov4d9b67c2014-07-24 10:42:13 +04004520 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004521 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4522 &rbd_dev->header_oloc, "get_parent",
4523 &snapid, sizeof(snapid), reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06004524 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder86b00e02012-10-25 23:34:42 -05004525 if (ret < 0)
4526 goto out_err;
4527
Alex Elder86b00e02012-10-25 23:34:42 -05004528 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05004529 end = reply_buf + ret;
4530 ret = -ERANGE;
Alex Elder642a2532013-05-06 17:40:33 -05004531 ceph_decode_64_safe(&p, end, pool_id, out_err);
Alex Elder392a9da2013-05-06 17:40:33 -05004532 if (pool_id == CEPH_NOPOOL) {
4533 /*
4534 * Either the parent never existed, or we have a
4535 * record of it but the image got flattened so it no
4536 * longer has a parent. When the parent of a
4537 * layered image disappears we immediately set the
4538 * overlap to 0. The effect of this is that all new
4539 * requests will be treated as if the image had no
4540 * parent.
4541 */
4542 if (rbd_dev->parent_overlap) {
4543 rbd_dev->parent_overlap = 0;
Alex Elder392a9da2013-05-06 17:40:33 -05004544 rbd_dev_parent_put(rbd_dev);
4545 pr_info("%s: clone image has been flattened\n",
4546 rbd_dev->disk->disk_name);
4547 }
4548
Alex Elder86b00e02012-10-25 23:34:42 -05004549 goto out; /* No parent? No problem. */
Alex Elder392a9da2013-05-06 17:40:33 -05004550 }
Alex Elder86b00e02012-10-25 23:34:42 -05004551
Alex Elder0903e872012-11-14 12:25:19 -06004552 /* The ceph file layout needs to fit pool id in 32 bits */
4553
4554 ret = -EIO;
Alex Elder642a2532013-05-06 17:40:33 -05004555 if (pool_id > (u64)U32_MAX) {
Ilya Dryomov9584d502014-07-11 12:11:20 +04004556 rbd_warn(NULL, "parent pool id too large (%llu > %u)",
Alex Elder642a2532013-05-06 17:40:33 -05004557 (unsigned long long)pool_id, U32_MAX);
Alex Elder57385b52013-04-21 12:14:45 -05004558 goto out_err;
Alex Elderc0cd10db2013-04-26 09:43:47 -05004559 }
Alex Elder0903e872012-11-14 12:25:19 -06004560
Alex Elder979ed482012-11-01 08:39:26 -05004561 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elder86b00e02012-10-25 23:34:42 -05004562 if (IS_ERR(image_id)) {
4563 ret = PTR_ERR(image_id);
4564 goto out_err;
4565 }
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004566 ceph_decode_64_safe(&p, end, snap_id, out_err);
Alex Elder86b00e02012-10-25 23:34:42 -05004567 ceph_decode_64_safe(&p, end, overlap, out_err);
4568
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004569 /*
4570 * The parent won't change (except when the clone is
4571 * flattened, which is handled above). So we only need to
4572 * record the parent spec if we have not already done so.
4573 */
4574 if (!rbd_dev->parent_spec) {
4575 parent_spec->pool_id = pool_id;
4576 parent_spec->image_id = image_id;
4577 parent_spec->snap_id = snap_id;
Alex Elder70cf49c2013-05-06 17:40:33 -05004578 rbd_dev->parent_spec = parent_spec;
4579 parent_spec = NULL; /* rbd_dev now owns this */
Ilya Dryomovfbba11b2014-06-27 21:46:33 +04004580 } else {
4581 kfree(image_id);
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004582 }
4583
4584 /*
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004585 * We always update the parent overlap. If it's zero we issue
4586 * a warning, as we will proceed as if there was no parent.
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004587 */
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004588 if (!overlap) {
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004589 if (parent_spec) {
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004590 /* refresh, careful to warn just once */
4591 if (rbd_dev->parent_overlap)
4592 rbd_warn(rbd_dev,
4593 "clone now standalone (overlap became 0)");
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004594 } else {
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004595 /* initial probe */
4596 rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004597 }
Alex Elder70cf49c2013-05-06 17:40:33 -05004598 }
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004599 rbd_dev->parent_overlap = overlap;
4600
Alex Elder86b00e02012-10-25 23:34:42 -05004601out:
4602 ret = 0;
4603out_err:
4604 kfree(reply_buf);
4605 rbd_spec_put(parent_spec);
4606
4607 return ret;
4608}
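/*
 * For reference, the "get_parent" reply decoded above has this wire
 * layout (little-endian; a string is a __le32 length followed by
 * that many bytes):
 *
 *   __le64 pool_id;
 *   string image_id;
 *   __le64 snap_id;
 *   __le64 overlap;
 *
 * which matches the reply buffer size computed before the call.
 */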
4609
Alex Eldercc070d52013-04-21 12:14:45 -05004610static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
4611{
4612 struct {
4613 __le64 stripe_unit;
4614 __le64 stripe_count;
4615 } __attribute__ ((packed)) striping_info_buf = { 0 };
4616 size_t size = sizeof (striping_info_buf);
4617 void *p;
4618 u64 obj_size;
4619 u64 stripe_unit;
4620 u64 stripe_count;
4621 int ret;
4622
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004623 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4624 &rbd_dev->header_oloc, "get_stripe_unit_count",
4625 NULL, 0, &striping_info_buf, size);
Alex Eldercc070d52013-04-21 12:14:45 -05004626 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4627 if (ret < 0)
4628 return ret;
4629 if (ret < size)
4630 return -ERANGE;
4631
4632 /*
4633 * We don't actually support the "fancy striping" feature
4634 * (STRIPINGV2) yet, but if the striping sizes are the
4635 * defaults the behavior is the same as before. So find
4636 * out, and only fail if the image has non-default values.
4637 */
4638 ret = -EINVAL;
Ilya Dryomov5bc3fb12017-01-25 18:16:22 +01004639 obj_size = rbd_obj_bytes(&rbd_dev->header);
Alex Eldercc070d52013-04-21 12:14:45 -05004640 p = &striping_info_buf;
4641 stripe_unit = ceph_decode_64(&p);
4642 if (stripe_unit != obj_size) {
4643 rbd_warn(rbd_dev, "unsupported stripe unit "
4644 "(got %llu want %llu)",
4645 stripe_unit, obj_size);
4646 return -EINVAL;
4647 }
4648 stripe_count = ceph_decode_64(&p);
4649 if (stripe_count != 1) {
4650 rbd_warn(rbd_dev, "unsupported stripe count "
4651 "(got %llu want 1)", stripe_count);
4652 return -EINVAL;
4653 }
Alex Elder500d0c02013-04-26 09:43:47 -05004654 rbd_dev->header.stripe_unit = stripe_unit;
4655 rbd_dev->header.stripe_count = stripe_count;
Alex Eldercc070d52013-04-21 12:14:45 -05004656
4657 return 0;
4658}
4659
Ilya Dryomov7e973322017-01-25 18:16:22 +01004660static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
4661{
4662 __le64 data_pool_id;
4663 int ret;
4664
4665 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4666 &rbd_dev->header_oloc, "get_data_pool",
4667 NULL, 0, &data_pool_id, sizeof(data_pool_id));
4668 if (ret < 0)
4669 return ret;
4670 if (ret < sizeof(data_pool_id))
4671 return -EBADMSG;
4672
4673 rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
4674 WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
4675 return 0;
4676}
4677
Alex Elder9e15b772012-10-30 19:40:33 -05004678static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
4679{
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004680 CEPH_DEFINE_OID_ONSTACK(oid);
Alex Elder9e15b772012-10-30 19:40:33 -05004681 size_t image_id_size;
4682 char *image_id;
4683 void *p;
4684 void *end;
4685 size_t size;
4686 void *reply_buf = NULL;
4687 size_t len = 0;
4688 char *image_name = NULL;
4689 int ret;
4690
4691 rbd_assert(!rbd_dev->spec->image_name);
4692
Alex Elder69e7a022012-11-01 08:39:26 -05004693 len = strlen(rbd_dev->spec->image_id);
4694 image_id_size = sizeof (__le32) + len;
Alex Elder9e15b772012-10-30 19:40:33 -05004695 image_id = kmalloc(image_id_size, GFP_KERNEL);
4696 if (!image_id)
4697 return NULL;
4698
4699 p = image_id;
Alex Elder41579762013-04-21 12:14:45 -05004700 end = image_id + image_id_size;
Alex Elder57385b52013-04-21 12:14:45 -05004701 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
Alex Elder9e15b772012-10-30 19:40:33 -05004702
4703 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
4704 reply_buf = kmalloc(size, GFP_KERNEL);
4705 if (!reply_buf)
4706 goto out;
4707
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004708 ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
4709 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
4710 "dir_get_name", image_id, image_id_size,
4711 reply_buf, size);
Alex Elder9e15b772012-10-30 19:40:33 -05004712 if (ret < 0)
4713 goto out;
4714 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05004715 end = reply_buf + ret;
4716
Alex Elder9e15b772012-10-30 19:40:33 -05004717 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
4718 if (IS_ERR(image_name))
4719 image_name = NULL;
4720 else
4721 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
4722out:
4723 kfree(reply_buf);
4724 kfree(image_id);
4725
4726 return image_name;
4727}
4728
Alex Elder2ad3d712013-04-30 00:44:33 -05004729static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4730{
4731 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4732 const char *snap_name;
4733 u32 which = 0;
4734
4735 /* Skip over names until we find the one we are looking for */
4736
4737 snap_name = rbd_dev->header.snap_names;
4738 while (which < snapc->num_snaps) {
4739 if (!strcmp(name, snap_name))
4740 return snapc->snaps[which];
4741 snap_name += strlen(snap_name) + 1;
4742 which++;
4743 }
4744 return CEPH_NOSNAP;
4745}
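/*
 * Illustrative layout (names made up): for a v1 image with three
 * snapshots, header.snap_names is a run of NUL-terminated strings,
 *
 *   "snap1\0snap2\0snap3\0"
 *
 * and snapc->snaps[which] holds the id of the which'th name, so the
 * loop above advances the name pointer and the index in lockstep.
 */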
4746
4747static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4748{
4749 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4750 u32 which;
4751 bool found = false;
4752 u64 snap_id;
4753
4754 for (which = 0; !found && which < snapc->num_snaps; which++) {
4755 const char *snap_name;
4756
4757 snap_id = snapc->snaps[which];
4758 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
Josh Durginefadc982013-08-29 19:16:42 -07004759 if (IS_ERR(snap_name)) {
4760 /* ignore no-longer existing snapshots */
4761 if (PTR_ERR(snap_name) == -ENOENT)
4762 continue;
4763 else
4764 break;
4765 }
Alex Elder2ad3d712013-04-30 00:44:33 -05004766 found = !strcmp(name, snap_name);
4767 kfree(snap_name);
4768 }
4769 return found ? snap_id : CEPH_NOSNAP;
4770}
4771
4772/*
4773 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
4774 * no snapshot by that name is found, or if an error occurs.
4775 */
4776static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4777{
4778 if (rbd_dev->image_format == 1)
4779 return rbd_v1_snap_id_by_name(rbd_dev, name);
4780
4781 return rbd_v2_snap_id_by_name(rbd_dev, name);
4782}
4783
Alex Elder9e15b772012-10-30 19:40:33 -05004784/*
Ilya Dryomov04077592014-07-23 17:11:20 +04004785 * An image being mapped will have everything but the snap id.
Alex Elder9e15b772012-10-30 19:40:33 -05004786 */
Ilya Dryomov04077592014-07-23 17:11:20 +04004787static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
4788{
4789 struct rbd_spec *spec = rbd_dev->spec;
4790
4791 rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
4792 rbd_assert(spec->image_id && spec->image_name);
4793 rbd_assert(spec->snap_name);
4794
4795 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
4796 u64 snap_id;
4797
4798 snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
4799 if (snap_id == CEPH_NOSNAP)
4800 return -ENOENT;
4801
4802 spec->snap_id = snap_id;
4803 } else {
4804 spec->snap_id = CEPH_NOSNAP;
4805 }
4806
4807 return 0;
4808}
4809
4810/*
4811 * A parent image will have all ids but none of the names.
4812 *
4813 * All names in an rbd spec are dynamically allocated. It's OK if we
4814 * can't figure out the name for an image id.
4815 */
4816static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
Alex Elder9e15b772012-10-30 19:40:33 -05004817{
Alex Elder2e9f7f12013-04-26 09:43:48 -05004818 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4819 struct rbd_spec *spec = rbd_dev->spec;
4820 const char *pool_name;
4821 const char *image_name;
4822 const char *snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05004823 int ret;
4824
Ilya Dryomov04077592014-07-23 17:11:20 +04004825 rbd_assert(spec->pool_id != CEPH_NOPOOL);
4826 rbd_assert(spec->image_id);
4827 rbd_assert(spec->snap_id != CEPH_NOSNAP);
Alex Elder9e15b772012-10-30 19:40:33 -05004828
Alex Elder2e9f7f12013-04-26 09:43:48 -05004829 /* Get the pool name; we have to make our own copy of this */
Alex Elder9e15b772012-10-30 19:40:33 -05004830
Alex Elder2e9f7f12013-04-26 09:43:48 -05004831 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
4832 if (!pool_name) {
4833 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05004834 return -EIO;
4835 }
Alex Elder2e9f7f12013-04-26 09:43:48 -05004836 pool_name = kstrdup(pool_name, GFP_KERNEL);
4837 if (!pool_name)
Alex Elder9e15b772012-10-30 19:40:33 -05004838 return -ENOMEM;
4839
4840 /* Fetch the image name; tolerate failure here */
4841
Alex Elder2e9f7f12013-04-26 09:43:48 -05004842 image_name = rbd_dev_image_name(rbd_dev);
4843 if (!image_name)
Alex Elder06ecc6c2012-11-01 10:17:15 -05004844 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05004845
Ilya Dryomov04077592014-07-23 17:11:20 +04004846 /* Fetch the snapshot name */
Alex Elder9e15b772012-10-30 19:40:33 -05004847
Alex Elder2e9f7f12013-04-26 09:43:48 -05004848 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
Josh Durginda6a6b62013-09-04 17:57:31 -07004849 if (IS_ERR(snap_name)) {
4850 ret = PTR_ERR(snap_name);
Alex Elder9e15b772012-10-30 19:40:33 -05004851 goto out_err;
Alex Elder2e9f7f12013-04-26 09:43:48 -05004852 }
4853
4854 spec->pool_name = pool_name;
4855 spec->image_name = image_name;
4856 spec->snap_name = snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05004857
4858 return 0;
Ilya Dryomov04077592014-07-23 17:11:20 +04004859
Alex Elder9e15b772012-10-30 19:40:33 -05004860out_err:
Alex Elder2e9f7f12013-04-26 09:43:48 -05004861 kfree(image_name);
4862 kfree(pool_name);
Alex Elder9e15b772012-10-30 19:40:33 -05004863 return ret;
4864}
4865
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004866static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
Alex Elder35d489f2012-07-03 16:01:19 -05004867{
4868 size_t size;
4869 int ret;
4870 void *reply_buf;
4871 void *p;
4872 void *end;
4873 u64 seq;
4874 u32 snap_count;
4875 struct ceph_snap_context *snapc;
4876 u32 i;
4877
4878 /*
4879 * We'll need room for the seq value (maximum snapshot id),
4880 * snapshot count, and array of that many snapshot ids.
4881 * For now we have a fixed upper limit on the number we're
4882 * prepared to receive.
4883 */
4884 size = sizeof (__le64) + sizeof (__le32) +
4885 RBD_MAX_SNAP_COUNT * sizeof (__le64);
4886 reply_buf = kzalloc(size, GFP_KERNEL);
4887 if (!reply_buf)
4888 return -ENOMEM;
4889
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004890 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4891 &rbd_dev->header_oloc, "get_snapcontext",
4892 NULL, 0, reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06004893 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder35d489f2012-07-03 16:01:19 -05004894 if (ret < 0)
4895 goto out;
4896
Alex Elder35d489f2012-07-03 16:01:19 -05004897 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05004898 end = reply_buf + ret;
4899 ret = -ERANGE;
Alex Elder35d489f2012-07-03 16:01:19 -05004900 ceph_decode_64_safe(&p, end, seq, out);
4901 ceph_decode_32_safe(&p, end, snap_count, out);
4902
4903 /*
4904 * Make sure the reported number of snapshot ids wouldn't go
4905 * beyond the end of our buffer. But before checking that,
4906 * make sure the computed size of the snapshot context we
4907 * allocate is representable in a size_t.
4908 */
4909 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
4910 / sizeof (u64)) {
4911 ret = -EINVAL;
4912 goto out;
4913 }
4914 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
4915 goto out;
Alex Elder468521c2013-04-26 09:43:47 -05004916 ret = 0;
Alex Elder35d489f2012-07-03 16:01:19 -05004917
Alex Elder812164f82013-04-30 00:44:32 -05004918 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
Alex Elder35d489f2012-07-03 16:01:19 -05004919 if (!snapc) {
4920 ret = -ENOMEM;
4921 goto out;
4922 }
Alex Elder35d489f2012-07-03 16:01:19 -05004923 snapc->seq = seq;
Alex Elder35d489f2012-07-03 16:01:19 -05004924 for (i = 0; i < snap_count; i++)
4925 snapc->snaps[i] = ceph_decode_64(&p);
4926
Alex Elder49ece552013-05-06 08:37:00 -05004927 ceph_put_snap_context(rbd_dev->header.snapc);
Alex Elder35d489f2012-07-03 16:01:19 -05004928 rbd_dev->header.snapc = snapc;
4929
4930 dout(" snap context seq = %llu, snap_count = %u\n",
Alex Elder57385b52013-04-21 12:14:45 -05004931 (unsigned long long)seq, (unsigned int)snap_count);
Alex Elder35d489f2012-07-03 16:01:19 -05004932out:
4933 kfree(reply_buf);
4934
Alex Elder57385b52013-04-21 12:14:45 -05004935 return ret;
Alex Elder35d489f2012-07-03 16:01:19 -05004936}
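/*
 * For reference, the "get_snapcontext" reply decoded above has this
 * wire layout (little-endian):
 *
 *   __le64 seq;                   maximum snapshot id
 *   __le32 snap_count;
 *   __le64 snaps[snap_count];     snapshot ids
 */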
4937
Alex Elder54cac612013-04-30 00:44:33 -05004938static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
4939 u64 snap_id)
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004940{
4941 size_t size;
4942 void *reply_buf;
Alex Elder54cac612013-04-30 00:44:33 -05004943 __le64 snapid;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004944 int ret;
4945 void *p;
4946 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004947 char *snap_name;
4948
4949 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
4950 reply_buf = kmalloc(size, GFP_KERNEL);
4951 if (!reply_buf)
4952 return ERR_PTR(-ENOMEM);
4953
Alex Elder54cac612013-04-30 00:44:33 -05004954 snapid = cpu_to_le64(snap_id);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004955 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4956 &rbd_dev->header_oloc, "get_snapshot_name",
4957 &snapid, sizeof(snapid), reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06004958 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderf40eb342013-04-25 15:09:42 -05004959 if (ret < 0) {
4960 snap_name = ERR_PTR(ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004961 goto out;
Alex Elderf40eb342013-04-25 15:09:42 -05004962 }
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004963
4964 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05004965 end = reply_buf + ret;
Alex Eldere5c35532012-10-25 23:34:41 -05004966 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderf40eb342013-04-25 15:09:42 -05004967 if (IS_ERR(snap_name))
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004968 goto out;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004969
Alex Elderf40eb342013-04-25 15:09:42 -05004970 dout(" snap_id 0x%016llx snap_name = %s\n",
Alex Elder54cac612013-04-30 00:44:33 -05004971 (unsigned long long)snap_id, snap_name);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004972out:
4973 kfree(reply_buf);
4974
Alex Elderf40eb342013-04-25 15:09:42 -05004975 return snap_name;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004976}
4977
Alex Elder2df3fac2013-05-06 09:51:30 -05004978static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
Alex Elder117973f2012-08-31 17:29:55 -05004979{
Alex Elder2df3fac2013-05-06 09:51:30 -05004980 bool first_time = rbd_dev->header.object_prefix == NULL;
Alex Elder117973f2012-08-31 17:29:55 -05004981 int ret;
Alex Elder117973f2012-08-31 17:29:55 -05004982
Josh Durgin1617e402013-06-12 14:43:10 -07004983 ret = rbd_dev_v2_image_size(rbd_dev);
4984 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05004985 return ret;
Josh Durgin1617e402013-06-12 14:43:10 -07004986
Alex Elder2df3fac2013-05-06 09:51:30 -05004987 if (first_time) {
4988 ret = rbd_dev_v2_header_onetime(rbd_dev);
4989 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05004990 return ret;
Alex Elder2df3fac2013-05-06 09:51:30 -05004991 }
4992
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004993 ret = rbd_dev_v2_snap_context(rbd_dev);
Ilya Dryomovd194cd12015-08-31 18:22:10 +03004994 if (ret && first_time) {
4995 kfree(rbd_dev->header.object_prefix);
4996 rbd_dev->header.object_prefix = NULL;
4997 }
Alex Elder117973f2012-08-31 17:29:55 -05004998
4999 return ret;
5000}
5001
Ilya Dryomova720ae02014-07-23 17:11:19 +04005002static int rbd_dev_header_info(struct rbd_device *rbd_dev)
5003{
5004 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5005
5006 if (rbd_dev->image_format == 1)
5007 return rbd_dev_v1_header_info(rbd_dev);
5008
5009 return rbd_dev_v2_header_info(rbd_dev);
5010}
5011
Alex Elder1ddbe942012-01-29 13:57:44 -06005012/*
Alex Eldere28fff262012-02-02 08:13:30 -06005013 * Skips over white space at *buf, and updates *buf to point to the
5014 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06005015 * the token (string of non-white space characters) found. Note
5016 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06005017 */
5018static inline size_t next_token(const char **buf)
5019{
5020 /*
5021 * These are the characters that produce nonzero for
5022 * isspace() in the "C" and "POSIX" locales.
5023 */
5024 const char *spaces = " \f\n\r\t\v";
5025
5026 *buf += strspn(*buf, spaces); /* Find start of token */
5027
5028 return strcspn(*buf, spaces); /* Return token length */
5029}
5030
5031/*
Alex Elderea3352f2012-07-09 21:04:23 -05005032 * Finds the next token in *buf, dynamically allocates a buffer big
5033 * enough to hold a copy of it, and copies the token into the new
5034 * buffer. The copy is guaranteed to be terminated with '\0'. Note
5035 * that a duplicate buffer is created even for a zero-length token.
5036 *
5037 * Returns a pointer to the newly-allocated duplicate, or a null
5038 * pointer if memory for the duplicate was not available. If
5039 * the lenp argument is a non-null pointer, the length of the token
5040 * (not including the '\0') is returned in *lenp.
5041 *
5042 * If successful, the *buf pointer will be updated to point beyond
5043 * the end of the found token.
5044 *
5045 * Note: uses GFP_KERNEL for allocation.
5046 */
5047static inline char *dup_token(const char **buf, size_t *lenp)
5048{
5049 char *dup;
5050 size_t len;
5051
5052 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05005053 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05005054 if (!dup)
5055 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05005056 *(dup + len) = '\0';
5057 *buf += len;
5058
5059 if (lenp)
5060 *lenp = len;
5061
5062 return dup;
5063}
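/*
 * Illustrative sketch (not part of the driver): how next_token()
 * and dup_token() cooperate when walking an options string. The
 * input string is a made-up example.
 */
static void example_tokenize(void)
{
	const char *buf = "  1.2.3.4:6789 rbd";
	size_t len;
	char *pool;

	len = next_token(&buf);	/* len == 12, buf -> "1.2.3.4:6789 rbd" */
	buf += len;		/* consume the monitor address */

	pool = dup_token(&buf, &len);	/* pool == "rbd", len == 3 */
	kfree(pool);
}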
5064
5065/*
Alex Elder859c31d2012-10-25 23:34:42 -05005066 * Parse the options provided for an "rbd add" (i.e., rbd image
5067 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
5068 * and the data written is passed here via a NUL-terminated buffer.
5069 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05005070 *
Alex Elder859c31d2012-10-25 23:34:42 -05005071 * The information extracted from these options is recorded in
5072 * the other parameters which return dynamically-allocated
5073 * structures:
5074 * ceph_opts
5075 * The address of a pointer that will refer to a ceph options
5076 * structure. Caller must release the returned pointer using
5077 * ceph_destroy_options() when it is no longer needed.
5078 * rbd_opts
5079 * Address of an rbd options pointer. Fully initialized by
5080 * this function; caller must release with kfree().
5081 * spec
5082 * Address of an rbd image specification pointer. Fully
5083 * initialized by this function based on parsed options.
5084 * Caller must release with rbd_spec_put().
5085 *
5086 * The options passed take this form:
5087 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
5088 * where:
5089 * <mon_addrs>
5090 * A comma-separated list of one or more monitor addresses.
5091 * A monitor address is an ip address, optionally followed
5092 * by a port number (separated by a colon).
5093 * I.e.: ip1[:port1][,ip2[:port2]...]
5094 * <options>
5095 * A comma-separated list of ceph and/or rbd options.
5096 * <pool_name>
5097 * The name of the rados pool containing the rbd image.
5098 * <image_name>
5099 * The name of the image in that pool to map.
5100 * <snap_id>
5101 * An optional snapshot id. If provided, the mapping will
5102 * present data from the image at the time that snapshot was
5103 * created. The image head is used if no snapshot id is
5104 * provided. Snapshot mappings are always read-only.
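 *
 * For example (monitor address, credentials, and names are made-up
 * values), writing
 *
 *   1.2.3.4:6789 name=admin,secret=AQBexample rbd myimage mysnap
 *
 * to /sys/bus/rbd/add would map snapshot "mysnap" of image
 * "myimage" in pool "rbd".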
Alex Eldera725f65e2012-02-02 08:13:30 -06005105 */
Alex Elder859c31d2012-10-25 23:34:42 -05005106static int rbd_add_parse_args(const char *buf,
Alex Elderdc79b112012-10-25 23:34:41 -05005107 struct ceph_options **ceph_opts,
Alex Elder859c31d2012-10-25 23:34:42 -05005108 struct rbd_options **opts,
5109 struct rbd_spec **rbd_spec)
Alex Eldera725f65e2012-02-02 08:13:30 -06005110{
Alex Elderd22f76e2012-07-12 10:46:35 -05005111 size_t len;
Alex Elder859c31d2012-10-25 23:34:42 -05005112 char *options;
Alex Elder0ddebc02012-10-25 23:34:41 -05005113 const char *mon_addrs;
Alex Elderecb4dc22013-04-26 09:43:47 -05005114 char *snap_name;
Alex Elder0ddebc02012-10-25 23:34:41 -05005115 size_t mon_addrs_size;
Alex Elder859c31d2012-10-25 23:34:42 -05005116 struct rbd_spec *spec = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05005117 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05005118 struct ceph_options *copts;
Alex Elderdc79b112012-10-25 23:34:41 -05005119 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06005120
5121 /* The first four tokens are required */
5122
Alex Elder7ef32142012-02-02 08:13:30 -06005123 len = next_token(&buf);
Alex Elder4fb5d6712012-11-01 10:17:15 -05005124 if (!len) {
5125 rbd_warn(NULL, "no monitor address(es) provided");
5126 return -EINVAL;
5127 }
Alex Elder0ddebc02012-10-25 23:34:41 -05005128 mon_addrs = buf;
Alex Elderf28e5652012-10-25 23:34:41 -05005129 mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06005130 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06005131
Alex Elderdc79b112012-10-25 23:34:41 -05005132 ret = -EINVAL;
Alex Elderf28e5652012-10-25 23:34:41 -05005133 options = dup_token(&buf, NULL);
5134 if (!options)
Alex Elderdc79b112012-10-25 23:34:41 -05005135 return -ENOMEM;
Alex Elder4fb5d6712012-11-01 10:17:15 -05005136 if (!*options) {
5137 rbd_warn(NULL, "no options provided");
5138 goto out_err;
5139 }
Alex Eldera725f65e2012-02-02 08:13:30 -06005140
Alex Elder859c31d2012-10-25 23:34:42 -05005141 spec = rbd_spec_alloc();
5142 if (!spec)
Alex Elderf28e5652012-10-25 23:34:41 -05005143 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05005144
5145 spec->pool_name = dup_token(&buf, NULL);
5146 if (!spec->pool_name)
5147 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05005148 if (!*spec->pool_name) {
5149 rbd_warn(NULL, "no pool name provided");
5150 goto out_err;
5151 }
Alex Eldere28fff262012-02-02 08:13:30 -06005152
Alex Elder69e7a022012-11-01 08:39:26 -05005153 spec->image_name = dup_token(&buf, NULL);
Alex Elder859c31d2012-10-25 23:34:42 -05005154 if (!spec->image_name)
Alex Elderf28e5652012-10-25 23:34:41 -05005155 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05005156 if (!*spec->image_name) {
5157 rbd_warn(NULL, "no image name provided");
5158 goto out_err;
5159 }
Alex Eldere28fff262012-02-02 08:13:30 -06005160
Alex Elderf28e5652012-10-25 23:34:41 -05005161 /*
5162 * Snapshot name is optional; default is to use "-"
5163 * (indicating the head/no snapshot).
5164 */
Alex Elder3feeb8942012-08-31 17:29:52 -05005165 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05005166 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05005167 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
5168 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elderf28e5652012-10-25 23:34:41 -05005169 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
Alex Elderdc79b112012-10-25 23:34:41 -05005170 ret = -ENAMETOOLONG;
Alex Elderf28e5652012-10-25 23:34:41 -05005171 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05005172 }
Alex Elderecb4dc22013-04-26 09:43:47 -05005173 snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
5174 if (!snap_name)
Alex Elderf28e5652012-10-25 23:34:41 -05005175 goto out_mem;
Alex Elderecb4dc22013-04-26 09:43:47 -05005176 *(snap_name + len) = '\0';
5177 spec->snap_name = snap_name;
Alex Eldere5c35532012-10-25 23:34:41 -05005178
Alex Elder0ddebc02012-10-25 23:34:41 -05005179 /* Initialize all rbd options to the defaults */
Alex Eldere28fff262012-02-02 08:13:30 -06005180
Alex Elder4e9afeb2012-10-25 23:34:41 -05005181 rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
5182 if (!rbd_opts)
5183 goto out_mem;
5184
5185 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
Ilya Dryomovb5584182015-06-23 16:21:19 +03005186 rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
Ilya Dryomov80de1912016-09-20 14:23:17 +02005187 rbd_opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
Ilya Dryomove010dd02017-04-13 12:17:39 +02005188 rbd_opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
Alex Elderd22f76e2012-07-12 10:46:35 -05005189
Alex Elder859c31d2012-10-25 23:34:42 -05005190 copts = ceph_parse_options(options, mon_addrs,
Alex Elder0ddebc02012-10-25 23:34:41 -05005191 mon_addrs + mon_addrs_size - 1,
Alex Elder4e9afeb2012-10-25 23:34:41 -05005192 parse_rbd_opts_token, rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05005193 if (IS_ERR(copts)) {
5194 ret = PTR_ERR(copts);
Alex Elderdc79b112012-10-25 23:34:41 -05005195 goto out_err;
5196 }
Alex Elder859c31d2012-10-25 23:34:42 -05005197 kfree(options);
5198
5199 *ceph_opts = copts;
Alex Elder4e9afeb2012-10-25 23:34:41 -05005200 *opts = rbd_opts;
Alex Elder859c31d2012-10-25 23:34:42 -05005201 *rbd_spec = spec;
Alex Elder0ddebc02012-10-25 23:34:41 -05005202
Alex Elderdc79b112012-10-25 23:34:41 -05005203 return 0;
Alex Elderf28e5652012-10-25 23:34:41 -05005204out_mem:
Alex Elderdc79b112012-10-25 23:34:41 -05005205 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05005206out_err:
Alex Elder859c31d2012-10-25 23:34:42 -05005207 kfree(rbd_opts);
5208 rbd_spec_put(spec);
Alex Elderf28e5652012-10-25 23:34:41 -05005209 kfree(options);
Alex Elderd22f76e2012-07-12 10:46:35 -05005210
Alex Elderdc79b112012-10-25 23:34:41 -05005211 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06005212}
5213
Alex Elder589d30e2012-07-10 20:30:11 -05005214/*
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005215 * Return pool id (>= 0) or a negative error code.
5216 */
5217static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
5218{
Ilya Dryomova319bf52015-05-15 12:02:17 +03005219 struct ceph_options *opts = rbdc->client->options;
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005220 u64 newest_epoch;
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005221 int tries = 0;
5222 int ret;
5223
5224again:
5225 ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
5226 if (ret == -ENOENT && tries++ < 1) {
Ilya Dryomovd0b19702016-04-28 16:07:27 +02005227 ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap",
5228 &newest_epoch);
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005229 if (ret < 0)
5230 return ret;
5231
5232 if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
Ilya Dryomov7cca78c2016-04-28 16:07:28 +02005233 ceph_osdc_maybe_request_map(&rbdc->client->osdc);
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005234 (void) ceph_monc_wait_osdmap(&rbdc->client->monc,
Ilya Dryomova319bf52015-05-15 12:02:17 +03005235 newest_epoch,
5236 opts->mount_timeout);
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005237 goto again;
5238 } else {
5239 /* the osdmap we have is new enough */
5240 return -ENOENT;
5241 }
5242 }
5243
5244 return ret;
5245}
5246
Ilya Dryomove010dd02017-04-13 12:17:39 +02005247static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
5248{
5249 down_write(&rbd_dev->lock_rwsem);
5250 if (__rbd_is_lock_owner(rbd_dev))
5251 rbd_unlock(rbd_dev);
5252 up_write(&rbd_dev->lock_rwsem);
5253}
5254
5255static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
5256{
5257 if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
5258 rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
5259 return -EINVAL;
5260 }
5261
5262 /* FIXME: "rbd map --exclusive" should be interruptible */
5263 down_read(&rbd_dev->lock_rwsem);
5264 rbd_wait_state_locked(rbd_dev);
5265 up_read(&rbd_dev->lock_rwsem);
5266 if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
5267 rbd_warn(rbd_dev, "failed to acquire exclusive lock");
5268 return -EROFS;
5269 }
5270
5271 return 0;
5272}
5273
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005274/*
Alex Elder589d30e2012-07-10 20:30:11 -05005275 * An rbd format 2 image has a unique identifier, distinct from the
5276 * name given to it by the user. Internally, that identifier is
5277 * what's used to specify the names of objects related to the image.
5278 *
5279 * A special "rbd id" object is used to map an rbd image name to its
5280 * id. If that object doesn't exist, then there is no v2 rbd image
5281 * with the supplied name.
5282 *
5283 * This function will record the given rbd_dev's image_id field if
5284 * it can be determined, and in that case will return 0. If any
5285 * errors occur a negative errno will be returned and the rbd_dev's
5286 * image_id field will be unchanged (and should be NULL).
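 *
 * For example (hypothetical image name): the id of a format 2 image
 * named "foo" would be looked up in an object named "rbd_id.foo",
 * i.e. RBD_ID_PREFIX followed by the image name, as constructed
 * below.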
5287 */
5288static int rbd_dev_image_id(struct rbd_device *rbd_dev)
5289{
5290 int ret;
5291 size_t size;
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005292 CEPH_DEFINE_OID_ONSTACK(oid);
Alex Elder589d30e2012-07-10 20:30:11 -05005293 void *response;
Alex Elderc0fba362013-04-25 23:15:08 -05005294 char *image_id;
Alex Elder2f82ee52012-10-30 19:40:33 -05005295
Alex Elder589d30e2012-07-10 20:30:11 -05005296 /*
Alex Elder2c0d0a12012-10-30 19:40:33 -05005297 * When probing a parent image, the image id is already
5298 * known (and the image name likely is not). There's no
Alex Elderc0fba362013-04-25 23:15:08 -05005299 * need to fetch the image id again in this case. We
5300 * do still need to set the image format though.
Alex Elder2c0d0a12012-10-30 19:40:33 -05005301 */
Alex Elderc0fba362013-04-25 23:15:08 -05005302 if (rbd_dev->spec->image_id) {
5303 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
5304
Alex Elder2c0d0a12012-10-30 19:40:33 -05005305 return 0;
Alex Elderc0fba362013-04-25 23:15:08 -05005306 }
Alex Elder2c0d0a12012-10-30 19:40:33 -05005307
5308 /*
Alex Elder589d30e2012-07-10 20:30:11 -05005309 * First, see if the format 2 image id file exists, and if
5310 * so, get the image's persistent id from it.
5311 */
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005312 ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
5313 rbd_dev->spec->image_name);
5314 if (ret)
5315 return ret;
5316
5317 dout("rbd id object name is %s\n", oid.name);
Alex Elder589d30e2012-07-10 20:30:11 -05005318
5319 /* Response will be an encoded string, which includes a length */
5320
5321 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
5322 response = kzalloc(size, GFP_NOIO);
5323 if (!response) {
5324 ret = -ENOMEM;
5325 goto out;
5326 }
5327
Alex Elderc0fba362013-04-25 23:15:08 -05005328 /* If it doesn't exist we'll assume it's a format 1 image */
5329
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005330 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5331 "get_id", NULL, 0,
5332 response, RBD_IMAGE_ID_LEN_MAX);
Alex Elder36be9a72013-01-19 00:30:28 -06005333 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderc0fba362013-04-25 23:15:08 -05005334 if (ret == -ENOENT) {
5335 image_id = kstrdup("", GFP_KERNEL);
5336 ret = image_id ? 0 : -ENOMEM;
5337 if (!ret)
5338 rbd_dev->image_format = 1;
Ilya Dryomov7dd440c2014-09-11 18:49:18 +04005339 } else if (ret >= 0) {
Alex Elderc0fba362013-04-25 23:15:08 -05005340 void *p = response;
Alex Elder589d30e2012-07-10 20:30:11 -05005341
Alex Elderc0fba362013-04-25 23:15:08 -05005342 image_id = ceph_extract_encoded_string(&p, p + ret,
Alex Elder979ed482012-11-01 08:39:26 -05005343 NULL, GFP_NOIO);
Duan Jiong461f7582014-04-11 16:38:12 +08005344 ret = PTR_ERR_OR_ZERO(image_id);
Alex Elderc0fba362013-04-25 23:15:08 -05005345 if (!ret)
5346 rbd_dev->image_format = 2;
Alex Elderc0fba362013-04-25 23:15:08 -05005347 }
5348
5349 if (!ret) {
5350 rbd_dev->spec->image_id = image_id;
5351 dout("image_id is %s\n", image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05005352 }
5353out:
5354 kfree(response);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005355 ceph_oid_destroy(&oid);
Alex Elder589d30e2012-07-10 20:30:11 -05005356 return ret;
5357}
5358
Alex Elder3abef3b2013-05-13 20:35:37 -05005359/*
5360 * Undo whatever state changes are made by a v1 or v2 header
5361 * info call.
5362 */
Alex Elder6fd48b32013-04-28 23:32:34 -05005363static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
5364{
5365 struct rbd_image_header *header;
5366
Ilya Dryomove69b8d42015-01-19 12:06:14 +03005367 rbd_dev_parent_put(rbd_dev);
Alex Elder6fd48b32013-04-28 23:32:34 -05005368
5369 /* Free dynamic fields from the header, then zero it out */
5370
5371 header = &rbd_dev->header;
Alex Elder812164f82013-04-30 00:44:32 -05005372 ceph_put_snap_context(header->snapc);
Alex Elder6fd48b32013-04-28 23:32:34 -05005373 kfree(header->snap_sizes);
5374 kfree(header->snap_names);
5375 kfree(header->object_prefix);
5376 memset(header, 0, sizeof (*header));
5377}

static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
{
	int ret;

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret)
		goto out_err;

	/*
	 * Get and check the features for the image.  Currently the
	 * features are assumed to never change.
	 */
	ret = rbd_dev_v2_features(rbd_dev);
	if (ret)
		goto out_err;

	/* If the image supports fancy striping, get its parameters */

	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
		ret = rbd_dev_v2_striping_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
		ret = rbd_dev_v2_data_pool(rbd_dev);
		if (ret)
			goto out_err;
	}

	rbd_init_layout(rbd_dev);
	return 0;

out_err:
	rbd_dev->header.features = 0;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;
	return ret;
}

/*
 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
 * rbd_dev_image_probe() recursion depth, which means it's also the
 * length of the already discovered part of the parent chain.
 */
static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
{
	struct rbd_device *parent = NULL;
	int ret;

	if (!rbd_dev->parent_spec)
		return 0;

	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
		pr_info("parent chain is too long (%d)\n", depth);
		ret = -EINVAL;
		goto out_err;
	}

	parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
	if (!parent) {
		ret = -ENOMEM;
		goto out_err;
	}

	/*
	 * Images related by parent/child relationships always share
	 * rbd_client and spec/parent_spec, so bump their refcounts.
	 */
	__rbd_get_client(rbd_dev->rbd_client);
	rbd_spec_get(rbd_dev->parent_spec);

	ret = rbd_dev_image_probe(parent, depth);
	if (ret < 0)
		goto out_err;

	rbd_dev->parent = parent;
	atomic_set(&rbd_dev->parent_ref, 1);
	return 0;

out_err:
	rbd_dev_unparent(rbd_dev);
	rbd_dev_destroy(parent);
	return ret;
}
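
/*
 * For a clone of a clone, the result of the probe above is a simple
 * chain (a sketch of the relationships, not code):
 *
 *   rbd_dev -> parent -> grandparent -> NULL
 *
 * where every link holds its own reference on the shared rbd_client
 * and on its parent_spec, and RBD_MAX_PARENT_CHAIN_LEN bounds the
 * recursion depth.
 */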

static void rbd_dev_device_release(struct rbd_device *rbd_dev)
{
	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
	rbd_dev_mapping_clear(rbd_dev);
	rbd_free_disk(rbd_dev);
	if (!single_major)
		unregister_blkdev(rbd_dev->major, rbd_dev->name);
}

/*
 * rbd_dev->header_rwsem must be locked for write and will be unlocked
 * upon return.
 */
static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
{
	int ret;

	/* Record our major and minor device numbers. */

	if (!single_major) {
		ret = register_blkdev(0, rbd_dev->name);
		if (ret < 0)
			goto err_out_unlock;

		rbd_dev->major = ret;
		rbd_dev->minor = 0;
	} else {
		rbd_dev->major = rbd_major;
		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
	}

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_dev_mapping_set(rbd_dev);
	if (ret)
		goto err_out_disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
	set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);

	ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
	if (ret)
		goto err_out_mapping;

	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
	up_write(&rbd_dev->header_rwsem);
	return 0;

err_out_mapping:
	rbd_dev_mapping_clear(rbd_dev);
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	if (!single_major)
		unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_unlock:
	up_write(&rbd_dev->header_rwsem);
	return ret;
}

static int rbd_dev_header_name(struct rbd_device *rbd_dev)
{
	struct rbd_spec *spec = rbd_dev->spec;
	int ret;

	/* Record the header object name for this rbd image. */

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (rbd_dev->image_format == 1)
		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
				       spec->image_name, RBD_SUFFIX);
	else
		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
				       RBD_HEADER_PREFIX, spec->image_id);

	return ret;
}
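
/*
 * For example (assuming the usual rbd_types.h values, RBD_SUFFIX
 * being ".rbd" and RBD_HEADER_PREFIX being "rbd_header."), a format 1
 * image named "foo" gets the header object "foo.rbd", while a format
 * 2 image whose id is "1028ae8944a" (a made-up illustrative id) gets
 * "rbd_header.1028ae8944a".
 */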

static void rbd_dev_image_release(struct rbd_device *rbd_dev)
{
	rbd_dev_unprobe(rbd_dev);
	if (rbd_dev->opts)
		rbd_unregister_watch(rbd_dev);
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;
}

/*
 * Probe for the existence of the header object for the given rbd
 * device.  If this image is the one being mapped (i.e., not a
 * parent), initiate a watch on its header object before using that
 * object to get detailed information about the rbd image.
 */
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
{
	int ret;

	/*
	 * Get the id from the image id object.  Unless there's an
	 * error, rbd_dev->spec->image_id will be filled in with
	 * a dynamically-allocated string, and rbd_dev->image_format
	 * will be set to either 1 or 2.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_header_name(rbd_dev);
	if (ret)
		goto err_out_format;

	if (!depth) {
		ret = rbd_register_watch(rbd_dev);
		if (ret) {
			if (ret == -ENOENT)
				pr_info("image %s/%s does not exist\n",
					rbd_dev->spec->pool_name,
					rbd_dev->spec->image_name);
			goto err_out_format;
		}
	}

	ret = rbd_dev_header_info(rbd_dev);
	if (ret)
		goto err_out_watch;

	/*
	 * If this image is the one being mapped, we have pool name and
	 * id, image name and id, and snap name - need to fill snap id.
	 * Otherwise this is a parent image, identified by pool, image
	 * and snap ids - need to fill in names for those ids.
	 */
	if (!depth)
		ret = rbd_spec_fill_snap_id(rbd_dev);
	else
		ret = rbd_spec_fill_names(rbd_dev);
	if (ret) {
		if (ret == -ENOENT)
			pr_info("snap %s/%s@%s does not exist\n",
				rbd_dev->spec->pool_name,
				rbd_dev->spec->image_name,
				rbd_dev->spec->snap_name);
		goto err_out_probe;
	}

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret)
			goto err_out_probe;

		/*
		 * Need to warn users if this image is the one being
		 * mapped and has a parent.
		 */
		if (!depth && rbd_dev->parent_spec)
			rbd_warn(rbd_dev,
				 "WARNING: kernel layering is EXPERIMENTAL!");
	}

	ret = rbd_dev_probe_parent(rbd_dev, depth);
	if (ret)
		goto err_out_probe;

	dout("discovered format %u image, header name is %s\n",
	     rbd_dev->image_format, rbd_dev->header_oid.name);
	return 0;

err_out_probe:
	rbd_dev_unprobe(rbd_dev);
err_out_watch:
	if (!depth)
		rbd_unregister_watch(rbd_dev);
err_out_format:
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;
	return ret;
}
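
/*
 * Probe sequence recap (descriptive only): determine the image id and
 * format, derive the header object name, register a watch if this is
 * the image being mapped, fetch the header info, fill in the missing
 * snap id or names, read parent info for layered images, and finally
 * probe any parent chain recursively via rbd_dev_probe_parent().
 */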

static ssize_t do_rbd_add(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	int rc;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto out;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}

	/* pick the pool */
	rc = rbd_add_get_pool_id(rbdc, spec->pool_name);
	if (rc < 0) {
		if (rc == -ENOENT)
			pr_info("pool %s does not exist\n", spec->pool_name);
		goto err_out_client;
	}
	spec->pool_id = (u64)rc;

	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
	if (!rbd_dev) {
		rc = -ENOMEM;
		goto err_out_client;
	}
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */
	rbd_opts = NULL;	/* rbd_dev now owns this */

	rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
	if (!rbd_dev->config_info) {
		rc = -ENOMEM;
		goto err_out_rbd_dev;
	}

	down_write(&rbd_dev->header_rwsem);
	rc = rbd_dev_image_probe(rbd_dev, 0);
	if (rc < 0) {
		up_write(&rbd_dev->header_rwsem);
		goto err_out_rbd_dev;
	}

	/* If we are mapping a snapshot it must be marked read-only */
	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
		rbd_dev->opts->read_only = true;

	rc = rbd_dev_device_setup(rbd_dev);
	if (rc)
		goto err_out_image_probe;

	if (rbd_dev->opts->exclusive) {
		rc = rbd_add_acquire_lock(rbd_dev);
		if (rc)
			goto err_out_device_setup;
	}

	/* Everything's ready.  Announce the disk to the world. */

	rc = device_add(&rbd_dev->dev);
	if (rc)
		goto err_out_image_lock;

	add_disk(rbd_dev->disk);
	/* see rbd_init_disk() */
	blk_put_queue(rbd_dev->disk->queue);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);

	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
		rbd_dev->header.features);
	rc = count;
out:
	module_put(THIS_MODULE);
	return rc;

err_out_image_lock:
	rbd_dev_image_unlock(rbd_dev);
err_out_device_setup:
	rbd_dev_device_release(rbd_dev);
err_out_image_probe:
	rbd_dev_image_release(rbd_dev);
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	rbd_spec_put(spec);
	kfree(rbd_opts);
	goto out;
}
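
/*
 * Example usage (a sketch; the authoritative syntax is described in
 * Documentation/ABI/testing/sysfs-bus-rbd):
 *
 *   # echo "1.2.3.4:6789 name=admin,secret=<key> rbd foo" \
 *         > /sys/bus/rbd/add
 *
 * maps image "foo" from pool "rbd" and announces it as /dev/rbd<N>.
 */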

static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	if (single_major)
		return -EINVAL;

	return do_rbd_add(bus, buf, count);
}

static ssize_t rbd_add_single_major(struct bus_type *bus,
				    const char *buf,
				    size_t count)
{
	return do_rbd_add(bus, buf, count);
}

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
{
	while (rbd_dev->parent) {
		struct rbd_device *first = rbd_dev;
		struct rbd_device *second = first->parent;
		struct rbd_device *third;

		/*
		 * Follow to the parent with no grandparent and
		 * remove it.
		 */
		while (second && (third = second->parent)) {
			first = second;
			second = third;
		}
		rbd_assert(second);
		rbd_dev_image_release(second);
		rbd_dev_destroy(second);
		first->parent = NULL;
		first->parent_overlap = 0;

		rbd_assert(first->parent_spec);
		rbd_spec_put(first->parent_spec);
		first->parent_spec = NULL;
	}
}
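
/*
 * Note the teardown order above: each pass of the outer loop walks to
 * the far end of the chain and releases the ancestor that has no
 * parent of its own, so the chain is dismantled one link at a time
 * until rbd_dev itself has no parent left.
 */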

static ssize_t do_rbd_remove(struct bus_type *bus,
			     const char *buf,
			     size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct list_head *tmp;
	int dev_id;
	char opt_buf[6];
	bool already = false;
	bool force = false;
	int ret;

	dev_id = -1;
	opt_buf[0] = '\0';
	sscanf(buf, "%d %5s", &dev_id, opt_buf);
	if (dev_id < 0) {
		pr_err("dev_id out of range\n");
		return -EINVAL;
	}
	if (opt_buf[0] != '\0') {
		if (!strcmp(opt_buf, "force")) {
			force = true;
		} else {
			pr_err("bad remove option at '%s'\n", opt_buf);
			return -EINVAL;
		}
	}

	ret = -ENOENT;
	spin_lock(&rbd_dev_list_lock);
	list_for_each(tmp, &rbd_dev_list) {
		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id == dev_id) {
			ret = 0;
			break;
		}
	}
	if (!ret) {
		spin_lock_irq(&rbd_dev->lock);
		if (rbd_dev->open_count && !force)
			ret = -EBUSY;
		else
			already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
						   &rbd_dev->flags);
		spin_unlock_irq(&rbd_dev->lock);
	}
	spin_unlock(&rbd_dev_list_lock);
	if (ret < 0 || already)
		return ret;

	if (force) {
		/*
		 * Prevent new IO from being queued and wait for existing
		 * IO to complete/fail.
		 */
		blk_mq_freeze_queue(rbd_dev->disk->queue);
		blk_set_queue_dying(rbd_dev->disk->queue);
	}

	del_gendisk(rbd_dev->disk);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);
	spin_unlock(&rbd_dev_list_lock);
	device_del(&rbd_dev->dev);

	rbd_dev_image_unlock(rbd_dev);
	rbd_dev_device_release(rbd_dev);
	rbd_dev_image_release(rbd_dev);
	rbd_dev_destroy(rbd_dev);
	return count;
}
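
/*
 * Example usage (a sketch): unmap device id 0, optionally forcing the
 * unmap even while the device is still open:
 *
 *   # echo "0" > /sys/bus/rbd/remove
 *   # echo "0 force" > /sys/bus/rbd/remove
 */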

static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	if (single_major)
		return -EINVAL;

	return do_rbd_remove(bus, buf, count);
}

static ssize_t rbd_remove_single_major(struct bus_type *bus,
				       const char *buf,
				       size_t count)
{
	return do_rbd_remove(bus, buf, count);
}

/*
 * create control files in sysfs
 * /sys/bus/rbd/...
 */
static int rbd_sysfs_init(void)
{
	int ret;

	ret = device_register(&rbd_root_dev);
	if (ret < 0)
		return ret;

	ret = bus_register(&rbd_bus_type);
	if (ret < 0)
		device_unregister(&rbd_root_dev);

	return ret;
}

static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}

static int rbd_slab_init(void)
{
	rbd_assert(!rbd_img_request_cache);
	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
	if (!rbd_img_request_cache)
		return -ENOMEM;

	rbd_assert(!rbd_obj_request_cache);
	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
	if (!rbd_obj_request_cache)
		goto out_err;

	return 0;

out_err:
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
	return -ENOMEM;
}

static void rbd_slab_exit(void)
{
	rbd_assert(rbd_obj_request_cache);
	kmem_cache_destroy(rbd_obj_request_cache);
	rbd_obj_request_cache = NULL;

	rbd_assert(rbd_img_request_cache);
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
}

static int __init rbd_init(void)
{
	int rc;

	if (!libceph_compatible(NULL)) {
		rbd_warn(NULL, "libceph incompatibility (quitting)");
		return -EINVAL;
	}

	rc = rbd_slab_init();
	if (rc)
		return rc;

	/*
	 * The number of active work items is limited by the number of
	 * rbd devices * queue depth, so leave @max_active at default.
	 */
	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
	if (!rbd_wq) {
		rc = -ENOMEM;
		goto err_out_slab;
	}

	if (single_major) {
		rbd_major = register_blkdev(0, RBD_DRV_NAME);
		if (rbd_major < 0) {
			rc = rbd_major;
			goto err_out_wq;
		}
	}

	rc = rbd_sysfs_init();
	if (rc)
		goto err_out_blkdev;

	if (single_major)
		pr_info("loaded (major %d)\n", rbd_major);
	else
		pr_info("loaded\n");

	return 0;

err_out_blkdev:
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
err_out_wq:
	destroy_workqueue(rbd_wq);
err_out_slab:
	rbd_slab_exit();
	return rc;
}

static void __exit rbd_exit(void)
{
	ida_destroy(&rbd_dev_id_ida);
	rbd_sysfs_cleanup();
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
	destroy_workqueue(rbd_wq);
	rbd_slab_exit();
}

module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
MODULE_LICENSE("GPL");