
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		 Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/striper.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/workqueue.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/*
 * Increment the given counter and return the value it had before the
 * increment.  If the counter is already 0 it is left alone (and 0 is
 * returned).  If the counter has been incremented past INT_MAX the
 * increment is undone and -EINVAL is returned.
 */
static int atomic_inc_return_safe(atomic_t *v)
{
	unsigned int counter;

	counter = (unsigned int)__atomic_add_unless(v, 1, 0);
	if (counter <= (unsigned int)INT_MAX)
		return (int)counter;

	atomic_dec(v);

	return -EINVAL;
}

/* Decrement the counter.  Return the resulting value, or -EINVAL */
static int atomic_dec_return_safe(atomic_t *v)
{
	int counter;

	counter = atomic_dec_return(v);
	if (counter >= 0)
		return counter;

	atomic_inc(v);

	return -EINVAL;
}
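
/*
 * Editor's illustration (not a caller in this driver): the two helpers
 * above are meant to be used as a pair, so that a counter that has
 * dropped to 0 can never be resurrected:
 *
 *	if (atomic_inc_return_safe(&refcnt) > 0) {
 *		... use the protected resource ...
 *		atomic_dec_return_safe(&refcnt);
 *	}
 *
 * rbd_dev->parent_ref (declared further below) is managed this way.
 */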

#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX	/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING		(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
#define RBD_FEATURE_DATA_POOL		(1ULL<<7)
#define RBD_FEATURE_OPERATIONS		(1ULL<<8)

#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
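
/*
 * Editor's note: an image whose header sets any feature bit outside
 * RBD_FEATURES_SUPPORTED (for example object-map, bit 3) is refused
 * at map time rather than mapped with that feature silently ignored.
 */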

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These six fields never change for a given rbd image */
	char *object_prefix;
	__u8 obj_order;
	u64 stripe_unit;
	u64 stripe_count;
	s64 data_pool_id;
	u64 features;		/* Might be changeable someday? */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* format 1 only */
	u64 *snap_sizes;	/* format 1 only */
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	const char	*pool_name;

	const char	*image_id;
	const char	*image_name;

	u64		snap_id;
	const char	*snap_name;

	struct kref	kref;
};
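
/*
 * Editor's illustration (made-up names): after "rbd map
 * mypool/myimage@mysnap", the rbd_spec carries pool_name "mypool",
 * image_name "myimage" and snap_name "mysnap"; pool_id, image_id and
 * snap_id are filled in by looking those names up in the cluster
 * during image probe.
 */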

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};

struct rbd_img_request;

enum obj_request_type {
	OBJ_REQUEST_NODATA = 1,
	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
	OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
};

enum obj_operation_type {
	OBJ_OP_READ = 1,
	OBJ_OP_WRITE,
	OBJ_OP_DISCARD,
};

/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 *                       need copyup
 * RBD_OBJ_WRITE_GUARD ---------------> RBD_OBJ_WRITE_COPYUP
 *        |     ^                              |
 *        v     \------------------------------/
 *      done
 *        ^
 *        |
 * RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * there is a parent or not.
 */
enum rbd_obj_write_state {
	RBD_OBJ_WRITE_FLAT = 1,
	RBD_OBJ_WRITE_GUARD,
	RBD_OBJ_WRITE_COPYUP,
};
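
/*
 * Editor's note, tracing the diagram above for a layered image: a
 * write starts out in RBD_OBJ_WRITE_GUARD and is sent guarded by an
 * "object must exist" assertion.  If the object doesn't exist yet,
 * the OSD fails the guard with -ENOENT, the relevant parent data is
 * read in, and the request moves to RBD_OBJ_WRITE_COPYUP, which
 * writes the copied-up parent data together with the original write.
 * Without a parent, the write skips all of this and runs as
 * RBD_OBJ_WRITE_FLAT.
 */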

struct rbd_obj_request {
	struct ceph_object_extent ex;
	union {
		bool			tried_parent;	/* for reads */
		enum rbd_obj_write_state write_state;	/* for writes */
	};

	struct rbd_img_request	*img_request;
	struct ceph_file_extent	*img_extents;
	u32			num_img_extents;

	union {
		struct ceph_bio_iter	bio_pos;
		struct {
			struct ceph_bvec_iter	bvec_pos;
			u32			bvec_count;
			u32			bvec_idx;
		};
	};
	struct bio_vec		*copyup_bvecs;
	u32			copyup_bvec_count;

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	int			result;

	struct kref		kref;
};

enum img_req_flags {
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};

struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	enum obj_operation_type	op_type;
	enum obj_request_type	data_type;
	unsigned long		flags;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};
	spinlock_t		completion_lock;
	u64			xferred;/* aggregate bytes transferred */
	int			result;	/* first nonzero obj_request result */

	struct list_head	object_extents;	/* obj_req.ex structs */
	u32			obj_request_count;
	u32			pending_count;

	struct kref		kref;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)

enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};

enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};

/* WatchNotify::ClientId */
struct rbd_client_id {
	u64 gid;
	u64 handle;
};

struct rbd_mapping {
	u64			size;
	u64			features;
};

/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	int			minor;
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
	char			*config_info;	/* add{,_single_major} string */

	struct ceph_object_id	header_oid;
	struct ceph_object_locator header_oloc;

	struct ceph_file_layout	layout;		/* used for all rbd requests */

	struct mutex		watch_mutex;
	enum rbd_watch_state	watch_state;
	struct ceph_osd_linger_request *watch_handle;
	u64			watch_cookie;
	struct delayed_work	watch_dwork;

	struct rw_semaphore	lock_rwsem;
	enum rbd_lock_state	lock_state;
	char			lock_cookie[32];
	struct rbd_client_id	owner_cid;
	struct work_struct	acquired_lock_work;
	struct work_struct	released_lock_work;
	struct delayed_work	lock_dwork;
	struct work_struct	unlock_work;
	wait_queue_head_t	lock_waitq;

	struct workqueue_struct	*task_wq;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	atomic_t		parent_ref;
	struct rbd_device	*parent;

	/* Block layer tags. */
	struct blk_mq_tag_set	tag_set;

	/* protects updating the header */
	struct rw_semaphore	header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags:
 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 *   by rbd_dev->lock
 * - BLACKLISTED is protected by rbd_dev->lock_rwsem
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
	RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
};

static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

static struct workqueue_struct *rbd_wq;

/*
 * single-major requires >= 0.75 version of userspace rbd utility.
 */
static bool single_major = true;
module_param(single_major, bool, S_IRUGO);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
				    size_t count);
static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
				       size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);

static int rbd_dev_id_to_minor(int dev_id)
{
	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
}

static int minor_to_rbd_dev_id(int minor)
{
	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}
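
/*
 * Editor's worked example: with RBD_SINGLE_MAJOR_PART_SHIFT of 4,
 * each device owns a block of 16 minors.  dev_id 0 maps to minor 0,
 * dev_id 1 to minor 16, dev_id 3 to minor 48, and minors 49..63 of
 * that last block are left for the partitions of rbd3.
 */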

static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
}

static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	bool is_lock_owner;

	down_read(&rbd_dev->lock_rwsem);
	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
	return is_lock_owner;
}

static ssize_t rbd_supported_features_show(struct bus_type *bus, char *buf)
{
	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}

static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
static BUS_ATTR(supported_features, S_IRUGO, rbd_supported_features_show, NULL);

static struct attribute *rbd_bus_attrs[] = {
	&bus_attr_add.attr,
	&bus_attr_remove.attr,
	&bus_attr_add_single_major.attr,
	&bus_attr_remove_single_major.attr,
	&bus_attr_supported_features.attr,
	NULL,
};

static umode_t rbd_bus_is_visible(struct kobject *kobj,
				  struct attribute *attr, int index)
{
	if (!single_major &&
	    (attr == &bus_attr_add_single_major.attr ||
	     attr == &bus_attr_remove_single_major.attr))
		return 0;

	return attr->mode;
}

static const struct attribute_group rbd_bus_group = {
	.attrs = rbd_bus_attrs,
	.is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_groups	= rbd_bus_groups,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}

#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		do {							\
			if (unlikely(!(expr))) {			\
				printk(KERN_ERR "\nAssertion failure in %s() " \
							"at line %d:\n\n" \
						"\trbd_assert(%s);\n\n", \
						__func__, __LINE__, #expr); \
				BUG();					\
			}						\
		} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_header_info(struct rbd_device *rbd_dev);
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
		u8 *order, u64 *snap_size);
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features);

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	(void) get_device(&rbd_dev->dev);

	return 0;
}

static void rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	put_device(&rbd_dev->dev);
}

static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
{
	int ro;

	if (get_user(ro, (int __user *)arg))
		return -EFAULT;

	/* Snapshots can't be marked read-write */
	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
		return -EROFS;

	/* Let blkdev_roset() handle it */
	return -ENOTTY;
}

static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	int ret;

	switch (cmd) {
	case BLKROSET:
		ret = rbd_ioctl_set_ro(rbd_dev, arg);
		break;
	default:
		ret = -ENOTTY;
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	return rbd_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
	.ioctl			= rbd_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl		= rbd_compat_ioctl,
#endif
};

/*
 * Initialize an rbd client instance.  Success or not, this function
 * consumes ceph_opts.  Caller holds client_mutex.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	rbdc->client = ceph_create_client(ceph_opts, rbdc);
	if (IS_ERR(rbdc->client))
		goto out_rbdc;
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_client;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;
out_client:
	ceph_destroy_client(rbdc->client);
out_rbdc:
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}

/*
 * (Per device) rbd map options
 */
enum {
	Opt_queue_depth,
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_err
};

static match_table_t rbd_opts_tokens = {
	{Opt_queue_depth, "queue_depth=%d"},
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	{Opt_lock_on_read, "lock_on_read"},
	{Opt_exclusive, "exclusive"},
	{Opt_err, NULL}
};

struct rbd_options {
	int	queue_depth;
	bool	read_only;
	bool	lock_on_read;
	bool	exclusive;
};

#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
#define RBD_READ_ONLY_DEFAULT	false
#define RBD_LOCK_ON_READ_DEFAULT	false
#define RBD_EXCLUSIVE_DEFAULT	false

static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token, argstr[0].from);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_queue_depth:
		if (intval < 1) {
			pr_err("queue_depth out of range\n");
			return -EINVAL;
		}
		rbd_opts->queue_depth = intval;
		break;
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	case Opt_lock_on_read:
		rbd_opts->lock_on_read = true;
		break;
	case Opt_exclusive:
		rbd_opts->exclusive = true;
		break;
	default:
		/* libceph prints "bad option" msg */
		return -EINVAL;
	}

	return 0;
}
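
/*
 * Editor's illustration (hypothetical monitor address and names): the
 * option string handed to rbd_add() is parsed by libceph, which
 * forwards tokens it does not recognize to parse_rbd_opts_token()
 * above.  For example,
 *
 *	echo "1.2.3.4:6789 name=admin,queue_depth=256,lock_on_read \
 *		mypool myimage -" > /sys/bus/rbd/add
 *
 * would set queue_depth to 256 and lock_on_read to true, leaving
 * read_only and exclusive at their defaults.
 */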

static char* obj_op_name(enum obj_operation_type op_type)
{
	switch (op_type) {
	case OBJ_OP_READ:
		return "read";
	case OBJ_OP_WRITE:
		return "write";
	case OBJ_OP_DISCARD:
		return "discard";
	default:
		return "???";
	}
}

/*
 * Destroy ceph client
 *
 * rbd_client_list_lock is taken here to remove the client from the
 * list, so the caller must not already hold it.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

static int wait_for_latest_osdmap(struct ceph_client *client)
{
	u64 newest_epoch;
	int ret;

	ret = ceph_monc_get_version(&client->monc, "osdmap", &newest_epoch);
	if (ret)
		return ret;

	if (client->osdc.osdmap->epoch >= newest_epoch)
		return 0;

	ceph_osdc_maybe_request_map(&client->osdc);
	return ceph_monc_wait_osdmap(&client->monc, newest_epoch,
				     client->options->mount_timeout);
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, ceph_opts is consumed by this
 * function.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret;

	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		ceph_destroy_options(ceph_opts);

		/*
		 * Using an existing client.  Make sure ->pg_pools is up to
		 * date before we look up the pool id in do_rbd_add().
		 */
		ret = wait_for_latest_osdmap(rbdc->client);
		if (ret) {
			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
			rbd_put_client(rbdc);
			rbdc = ERR_PTR(ret);
		}
	} else {
		rbdc = rbd_client_create(ceph_opts);
	}
	mutex_unlock(&client_mutex);

	return rbdc;
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}

/*
 * returns the size of an object in the image
 */
static u32 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1U << header->obj_order;
}

static void rbd_init_layout(struct rbd_device *rbd_dev)
{
	if (rbd_dev->header.stripe_unit == 0 ||
	    rbd_dev->header.stripe_count == 0) {
		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
		rbd_dev->header.stripe_count = 1;
	}

	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
}
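
/*
 * Editor's worked example: a default format 2 image has obj_order 22
 * (4 MiB objects) and no explicit striping, so rbd_init_layout()
 * picks stripe_unit = 4 MiB and stripe_count = 1, making stripe
 * boundaries coincide with object boundaries.  With the DATA_POOL
 * feature, data objects land in data_pool_id rather than the pool
 * the image header lives in.
 */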

/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
				 struct rbd_image_header_ondisk *ondisk)
{
	struct rbd_image_header *header = &rbd_dev->header;
	bool first_time = header->object_prefix == NULL;
	struct ceph_snap_context *snapc;
	char *object_prefix = NULL;
	char *snap_names = NULL;
	u64 *snap_sizes = NULL;
	u32 snap_count;
	int ret = -ENOMEM;
	u32 i;

	/* Allocate this now to avoid having to handle failure below */

	if (first_time) {
		object_prefix = kstrndup(ondisk->object_prefix,
					 sizeof(ondisk->object_prefix),
					 GFP_KERNEL);
		if (!object_prefix)
			return -ENOMEM;
	}

	/* Allocate the snapshot context and fill it in */

	snap_count = le32_to_cpu(ondisk->snap_count);
	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc)
		goto out_err;
	snapc->seq = le64_to_cpu(ondisk->snap_seq);
	if (snap_count) {
		struct rbd_image_snap_ondisk *snaps;
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* We'll keep a copy of the snapshot names... */

		if (snap_names_len > (u64)SIZE_MAX)
			goto out_2big;
		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!snap_names)
			goto out_err;

		/* ...as well as the array of their sizes. */
		snap_sizes = kmalloc_array(snap_count,
					   sizeof(*header->snap_sizes),
					   GFP_KERNEL);
		if (!snap_sizes)
			goto out_err;

Alex Elderbb23e372013-05-06 09:51:29 -05001042 * Copy the names, and fill in each snapshot's id
1043 * and size.
1044 *
Alex Elder99a41eb2013-05-06 09:51:30 -05001045 * Note that rbd_dev_v1_header_info() guarantees the
Alex Elderbb23e372013-05-06 09:51:29 -05001046 * ondisk buffer we're working with has
Alex Elderf785cc12012-08-23 23:22:06 -05001047 * snap_names_len bytes beyond the end of the
1048 * snapshot id array, this memcpy() is safe.
1049 */
Alex Elderbb23e372013-05-06 09:51:29 -05001050 memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
1051 snaps = ondisk->snaps;
1052 for (i = 0; i < snap_count; i++) {
1053 snapc->snaps[i] = le64_to_cpu(snaps[i].id);
1054 snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
1055 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001056 }
Alex Elder849b4262012-07-09 21:04:24 -05001057
Alex Elderbb23e372013-05-06 09:51:29 -05001058 /* We won't fail any more, fill in the header */
Alex Elder6a523252012-07-19 17:12:59 -05001059
Alex Elderbb23e372013-05-06 09:51:29 -05001060 if (first_time) {
1061 header->object_prefix = object_prefix;
1062 header->obj_order = ondisk->options.order;
Ilya Dryomov263423f2017-01-25 18:16:22 +01001063 rbd_init_layout(rbd_dev);
Alex Elder662518b2013-05-06 09:51:29 -05001064 } else {
1065 ceph_put_snap_context(header->snapc);
1066 kfree(header->snap_names);
1067 kfree(header->snap_sizes);
Alex Elderbb23e372013-05-06 09:51:29 -05001068 }
1069
1070 /* The remaining fields always get updated (when we refresh) */
Alex Elder621901d2012-08-23 23:22:06 -05001071
Alex Elderf84344f2012-08-31 17:29:51 -05001072 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elderbb23e372013-05-06 09:51:29 -05001073 header->snapc = snapc;
1074 header->snap_names = snap_names;
1075 header->snap_sizes = snap_sizes;
Alex Elder468521c2013-04-26 09:43:47 -05001076
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001077 return 0;
Alex Elderbb23e372013-05-06 09:51:29 -05001078out_2big:
1079 ret = -EIO;
Alex Elder6a523252012-07-19 17:12:59 -05001080out_err:
Alex Elderbb23e372013-05-06 09:51:29 -05001081 kfree(snap_sizes);
1082 kfree(snap_names);
1083 ceph_put_snap_context(snapc);
1084 kfree(object_prefix);
Alex Elderccece232012-07-10 20:30:10 -05001085
Alex Elderbb23e372013-05-06 09:51:29 -05001086 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001087}
1088
Alex Elder9682fc62013-04-30 00:44:33 -05001089static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
1090{
1091 const char *snap_name;
1092
1093 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
1094
1095 /* Skip over names until we find the one we are looking for */
1096
1097 snap_name = rbd_dev->header.snap_names;
1098 while (which--)
1099 snap_name += strlen(snap_name) + 1;
1100
1101 return kstrdup(snap_name, GFP_KERNEL);
1102}
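
/*
 * Editor's illustration: for format 1, snap_names is a packed buffer
 * of NUL-terminated strings, e.g. "snap3\0snap2\0snap1\0", kept in
 * the same order as the snapshot id array, which is why skipping
 * 'which' strings above lands on the right name.
 */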

/*
 * Snapshot id comparison function for use with qsort()/bsearch().
 * Note that result is for snapshots in *descending* order.
 */
static int snapid_compare_reverse(const void *s1, const void *s2)
{
	u64 snap_id1 = *(u64 *)s1;
	u64 snap_id2 = *(u64 *)s2;

	if (snap_id1 < snap_id2)
		return 1;
	return snap_id1 == snap_id2 ? 0 : -1;
}
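
/*
 * Editor's example: for a snapshot array kept in descending order,
 * say { 12, 8, 3 }, this comparator makes bsearch() treat a larger
 * id as sorting earlier, so a search for 8 lands on index 1.
 */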

/*
 * Search a snapshot context to see if the given snapshot id is
 * present.
 *
 * Returns the position of the snapshot id in the array if it's found,
 * or BAD_SNAP_INDEX otherwise.
 *
 * Note: The snapshot array is kept sorted (by the osd) in
 * reverse order, highest snapshot id first.
 */
Alex Elder9682fc62013-04-30 00:44:33 -05001128static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
1129{
1130 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
Alex Elder30d1cff2013-05-01 12:43:03 -05001131 u64 *found;
Alex Elder9682fc62013-04-30 00:44:33 -05001132
Alex Elder30d1cff2013-05-01 12:43:03 -05001133 found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
1134 sizeof (snap_id), snapid_compare_reverse);
Alex Elder9682fc62013-04-30 00:44:33 -05001135
Alex Elder30d1cff2013-05-01 12:43:03 -05001136 return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
Alex Elder9682fc62013-04-30 00:44:33 -05001137}
1138
Alex Elder2ad3d712013-04-30 00:44:33 -05001139static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
1140 u64 snap_id)
Alex Elder54cac612013-04-30 00:44:33 -05001141{
1142 u32 which;
Josh Durginda6a6b62013-09-04 17:57:31 -07001143 const char *snap_name;
Alex Elder54cac612013-04-30 00:44:33 -05001144
1145 which = rbd_dev_snap_index(rbd_dev, snap_id);
1146 if (which == BAD_SNAP_INDEX)
Josh Durginda6a6b62013-09-04 17:57:31 -07001147 return ERR_PTR(-ENOENT);
Alex Elder54cac612013-04-30 00:44:33 -05001148
Josh Durginda6a6b62013-09-04 17:57:31 -07001149 snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1150 return snap_name ? snap_name : ERR_PTR(-ENOMEM);
Alex Elder54cac612013-04-30 00:44:33 -05001151}
1152
Alex Elder9e15b772012-10-30 19:40:33 -05001153static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
1154{
Alex Elder9e15b772012-10-30 19:40:33 -05001155 if (snap_id == CEPH_NOSNAP)
1156 return RBD_SNAP_HEAD_NAME;
1157
Alex Elder54cac612013-04-30 00:44:33 -05001158 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1159 if (rbd_dev->image_format == 1)
1160 return rbd_dev_v1_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -05001161
Alex Elder54cac612013-04-30 00:44:33 -05001162 return rbd_dev_v2_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -05001163}
1164
Alex Elder2ad3d712013-04-30 00:44:33 -05001165static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
1166 u64 *snap_size)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001167{
Alex Elder2ad3d712013-04-30 00:44:33 -05001168 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1169 if (snap_id == CEPH_NOSNAP) {
1170 *snap_size = rbd_dev->header.image_size;
1171 } else if (rbd_dev->image_format == 1) {
1172 u32 which;
Alex Elder00f1f362012-02-07 12:03:36 -06001173
Alex Elder2ad3d712013-04-30 00:44:33 -05001174 which = rbd_dev_snap_index(rbd_dev, snap_id);
1175 if (which == BAD_SNAP_INDEX)
1176 return -ENOENT;
Alex Elder00f1f362012-02-07 12:03:36 -06001177
Alex Elder2ad3d712013-04-30 00:44:33 -05001178 *snap_size = rbd_dev->header.snap_sizes[which];
1179 } else {
1180 u64 size = 0;
1181 int ret;
1182
1183 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
1184 if (ret)
1185 return ret;
1186
1187 *snap_size = size;
1188 }
1189 return 0;
1190}
1191
1192static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
1193 u64 *snap_features)
1194{
1195 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1196 if (snap_id == CEPH_NOSNAP) {
1197 *snap_features = rbd_dev->header.features;
1198 } else if (rbd_dev->image_format == 1) {
1199 *snap_features = 0; /* No features for format 1 */
1200 } else {
1201 u64 features = 0;
1202 int ret;
1203
1204 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
1205 if (ret)
1206 return ret;
1207
1208 *snap_features = features;
1209 }
1210 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001211}
1212
Alex Elderd1cf5782013-04-27 09:59:30 -05001213static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001214{
Alex Elder8f4b7d92013-05-06 07:40:30 -05001215 u64 snap_id = rbd_dev->spec->snap_id;
Alex Elder2ad3d712013-04-30 00:44:33 -05001216 u64 size = 0;
1217 u64 features = 0;
1218 int ret;
Alex Elder8b0241f2013-04-25 23:15:08 -05001219
Alex Elder2ad3d712013-04-30 00:44:33 -05001220 ret = rbd_snap_size(rbd_dev, snap_id, &size);
1221 if (ret)
1222 return ret;
1223 ret = rbd_snap_features(rbd_dev, snap_id, &features);
1224 if (ret)
1225 return ret;
1226
1227 rbd_dev->mapping.size = size;
1228 rbd_dev->mapping.features = features;
1229
Alex Elder8b0241f2013-04-25 23:15:08 -05001230 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001231}
1232
Alex Elderd1cf5782013-04-27 09:59:30 -05001233static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1234{
1235 rbd_dev->mapping.size = 0;
1236 rbd_dev->mapping.features = 0;
Alex Elder200a6a82013-04-28 23:32:34 -05001237}
1238
Ilya Dryomov5359a172018-01-20 10:30:10 +01001239static void zero_bvec(struct bio_vec *bv)
1240{
1241 void *buf;
1242 unsigned long flags;
1243
1244 buf = bvec_kmap_irq(bv, &flags);
1245 memset(buf, 0, bv->bv_len);
1246 flush_dcache_page(bv->bv_page);
1247 bvec_kunmap_irq(buf, &flags);
1248}
1249
1250static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
1251{
1252 struct ceph_bio_iter it = *bio_pos;
1253
1254 ceph_bio_iter_advance(&it, off);
1255 ceph_bio_iter_advance_step(&it, bytes, ({
1256 zero_bvec(&bv);
1257 }));
1258}
1259
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001260static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
Alex Elderb9434c52013-04-19 15:34:50 -05001261{
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001262 struct ceph_bvec_iter it = *bvec_pos;
Alex Elderb9434c52013-04-19 15:34:50 -05001263
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001264 ceph_bvec_iter_advance(&it, off);
1265 ceph_bvec_iter_advance_step(&it, bytes, ({
1266 zero_bvec(&bv);
1267 }));
Alex Elderb9434c52013-04-19 15:34:50 -05001268}
1269
1270/*
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001271 * Zero a range in @obj_req's data buffer, defined by a bio (list) or
Ilya Dryomovafb97882018-02-06 19:26:35 +01001272 * (private) bio_vec array.
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001273 *
1274 * @off is relative to the start of the data buffer.
1275 */
1276static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
1277 u32 bytes)
1278{
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001279 switch (obj_req->img_request->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001280 case OBJ_REQUEST_BIO:
1281 zero_bios(&obj_req->bio_pos, off, bytes);
1282 break;
1283 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01001284 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001285 zero_bvecs(&obj_req->bvec_pos, off, bytes);
1286 break;
1287 default:
1288 rbd_assert(0);
1289 }
1290}
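/*
 * Illustrative sketch (not part of the driver): this is how the read
 * completion path below uses the zeroing helper. If a 4096-byte object
 * read comes back short with only 1024 bytes, the tail is zero-filled:
 *
 *	rbd_obj_zero_range(obj_req, 1024, 4096 - 1024);
 *
 * and a hole (-ENOENT) is the degenerate case off == 0, bytes == oe_len.
 */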
1291
Alex Elderbf0d5f502012-11-22 00:00:08 -06001292static void rbd_obj_request_destroy(struct kref *kref);
1293static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1294{
1295 rbd_assert(obj_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001296 dout("%s: obj %p (was %d)\n", __func__, obj_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001297 kref_read(&obj_request->kref));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001298 kref_put(&obj_request->kref, rbd_obj_request_destroy);
1299}
1300
Alex Elder0f2d5be2014-04-26 14:21:44 +04001301static void rbd_img_request_get(struct rbd_img_request *img_request)
1302{
1303 dout("%s: img %p (was %d)\n", __func__, img_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001304 kref_read(&img_request->kref));
Alex Elder0f2d5be2014-04-26 14:21:44 +04001305 kref_get(&img_request->kref);
1306}
1307
Alex Elderbf0d5f502012-11-22 00:00:08 -06001308static void rbd_img_request_destroy(struct kref *kref);
1309static void rbd_img_request_put(struct rbd_img_request *img_request)
1310{
1311 rbd_assert(img_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001312 dout("%s: img %p (was %d)\n", __func__, img_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001313 kref_read(&img_request->kref));
Ilya Dryomove93aca02018-02-06 19:26:35 +01001314 kref_put(&img_request->kref, rbd_img_request_destroy);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001315}
1316
1317static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1318 struct rbd_obj_request *obj_request)
1319{
Alex Elder25dcf952013-01-25 17:08:55 -06001320 rbd_assert(obj_request->img_request == NULL);
1321
Alex Elderb155e862013-04-15 14:50:37 -05001322 /* Image request now owns object's original reference */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001323 obj_request->img_request = img_request;
Alex Elder25dcf952013-01-25 17:08:55 -06001324 img_request->obj_request_count++;
Ilya Dryomov7114eda2018-02-01 11:50:47 +01001325 img_request->pending_count++;
Ilya Dryomov15961b42018-02-01 11:50:47 +01001326 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001327}
1328
1329static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1330 struct rbd_obj_request *obj_request)
1331{
Ilya Dryomov15961b42018-02-01 11:50:47 +01001332 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001333 list_del(&obj_request->ex.oe_item);
Alex Elder25dcf952013-01-25 17:08:55 -06001334 rbd_assert(img_request->obj_request_count > 0);
1335 img_request->obj_request_count--;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001336 rbd_assert(obj_request->img_request == img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001337 rbd_obj_request_put(obj_request);
1338}
1339
Ilya Dryomov980917f2016-09-12 18:59:42 +02001340static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001341{
Ilya Dryomov980917f2016-09-12 18:59:42 +02001342 struct ceph_osd_request *osd_req = obj_request->osd_req;
1343
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001344 dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001345 obj_request, obj_request->ex.oe_objno, obj_request->ex.oe_off,
1346 obj_request->ex.oe_len, osd_req);
Ilya Dryomov980917f2016-09-12 18:59:42 +02001347 ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001348}
1349
Alex Elder0c425242013-02-08 09:55:49 -06001350/*
1351 * The default/initial value for all image request flags is 0. Each
1352 * is conditionally set to 1 at image request initialization time
1353 * and currently never changes thereafter.
1354 */
Alex Elderd0b2e942013-01-24 16:13:36 -06001355static void img_request_layered_set(struct rbd_img_request *img_request)
1356{
1357 set_bit(IMG_REQ_LAYERED, &img_request->flags);
1358 smp_mb();
1359}
1360
Alex Eldera2acd002013-05-08 22:50:04 -05001361static void img_request_layered_clear(struct rbd_img_request *img_request)
1362{
1363 clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1364 smp_mb();
1365}
1366
Alex Elderd0b2e942013-01-24 16:13:36 -06001367static bool img_request_layered_test(struct rbd_img_request *img_request)
1368{
1369 smp_mb();
1370 return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1371}
1372
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001373static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
1374{
1375 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1376
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001377 return !obj_req->ex.oe_off &&
1378 obj_req->ex.oe_len == rbd_dev->layout.object_size;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001379}
1380
1381static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
1382{
1383 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1384
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001385 return obj_req->ex.oe_off + obj_req->ex.oe_len ==
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001386 rbd_dev->layout.object_size;
1387}
1388
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001389static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
1390{
1391 return ceph_file_extents_bytes(obj_req->img_extents,
1392 obj_req->num_img_extents);
1393}
1394
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001395static bool rbd_img_is_write(struct rbd_img_request *img_req)
1396{
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001397 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001398 case OBJ_OP_READ:
1399 return false;
1400 case OBJ_OP_WRITE:
1401 case OBJ_OP_DISCARD:
1402 return true;
1403 default:
1404 rbd_assert(0);
1405 }
1406}
1407
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001408static void rbd_obj_handle_request(struct rbd_obj_request *obj_req);
1409
Ilya Dryomov85e084f2016-04-28 16:07:24 +02001410static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001411{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001412 struct rbd_obj_request *obj_req = osd_req->r_priv;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001413
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001414 dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
1415 osd_req->r_result, obj_req);
1416 rbd_assert(osd_req == obj_req->osd_req);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001417
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001418 obj_req->result = osd_req->r_result < 0 ? osd_req->r_result : 0;
1419 if (!obj_req->result && !rbd_img_is_write(obj_req->img_request))
1420 obj_req->xferred = osd_req->r_result;
1421 else
1422 /*
1423 * Writes aren't allowed to return a data payload. In some
1424 * guarded write cases (e.g. stat + zero on an empty object)
1425 * a stat response makes it through, but we don't care.
1426 */
1427 obj_req->xferred = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001428
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001429 rbd_obj_handle_request(obj_req);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001430}
1431
Alex Elder9d4df012013-04-19 15:34:50 -05001432static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
Alex Elder430c28c2013-04-03 21:32:51 -05001433{
Alex Elder8c042b02013-04-03 01:28:58 -05001434 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder430c28c2013-04-03 21:32:51 -05001435
Ilya Dryomova162b302018-01-30 17:52:10 +01001436 osd_req->r_flags = CEPH_OSD_FLAG_READ;
Ilya Dryomov7c848832016-09-15 17:56:39 +02001437 osd_req->r_snapid = obj_request->img_request->snap_id;
Alex Elder9d4df012013-04-19 15:34:50 -05001438}
1439
1440static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
1441{
Alex Elder9d4df012013-04-19 15:34:50 -05001442 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder9d4df012013-04-19 15:34:50 -05001443
Ilya Dryomova162b302018-01-30 17:52:10 +01001444 osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
Deepa Dinamani1134e092017-05-08 15:59:19 -07001445 ktime_get_real_ts(&osd_req->r_mtime);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001446 osd_req->r_data_offset = obj_request->ex.oe_off;
Alex Elder430c28c2013-04-03 21:32:51 -05001447}
1448
Ilya Dryomovbc812072017-01-25 18:16:23 +01001449static struct ceph_osd_request *
Ilya Dryomova162b302018-01-30 17:52:10 +01001450rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops)
Ilya Dryomovbc812072017-01-25 18:16:23 +01001451{
Ilya Dryomova162b302018-01-30 17:52:10 +01001452 struct rbd_img_request *img_req = obj_req->img_request;
1453 struct rbd_device *rbd_dev = img_req->rbd_dev;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001454 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1455 struct ceph_osd_request *req;
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001456 const char *name_format = rbd_dev->image_format == 1 ?
1457 RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001458
Ilya Dryomova162b302018-01-30 17:52:10 +01001459 req = ceph_osdc_alloc_request(osdc,
1460 (rbd_img_is_write(img_req) ? img_req->snapc : NULL),
1461 num_ops, false, GFP_NOIO);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001462 if (!req)
1463 return NULL;
1464
Ilya Dryomovbc812072017-01-25 18:16:23 +01001465 req->r_callback = rbd_osd_req_callback;
Ilya Dryomova162b302018-01-30 17:52:10 +01001466 req->r_priv = obj_req;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001467
1468 req->r_base_oloc.pool = rbd_dev->layout.pool_id;
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001469 if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001470 rbd_dev->header.object_prefix, obj_req->ex.oe_objno))
Ilya Dryomovbc812072017-01-25 18:16:23 +01001471 goto err_req;
1472
1473 if (ceph_osdc_alloc_messages(req, GFP_NOIO))
1474 goto err_req;
1475
1476 return req;
1477
1478err_req:
1479 ceph_osdc_put_request(req);
1480 return NULL;
1481}
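/*
 * Example (illustrative; assumes the RBD_V*_DATA_FORMAT strings defined
 * near the top of this file): a format 2 image with object_prefix
 * "rbd_data.101574b0dc51" maps object number 1 to the OSD object name
 * "rbd_data.101574b0dc51.0000000000000001" (16 hex digits), while
 * format 1 names use a 12-hex-digit suffix.
 */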
1482
Alex Elderbf0d5f502012-11-22 00:00:08 -06001483static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
1484{
1485 ceph_osdc_put_request(osd_req);
1486}
1487
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001488static struct rbd_obj_request *rbd_obj_request_create(void)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001489{
1490 struct rbd_obj_request *obj_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001491
Ilya Dryomov5a60e872015-06-24 17:24:33 +03001492 obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
Ilya Dryomov6c696d82017-01-25 18:16:23 +01001493 if (!obj_request)
Alex Elderf907ad52013-05-01 12:43:03 -05001494 return NULL;
Alex Elderf907ad52013-05-01 12:43:03 -05001495
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001496 ceph_object_extent_init(&obj_request->ex);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001497 kref_init(&obj_request->kref);
1498
Ilya Dryomov67e2b652017-01-25 18:16:22 +01001499 dout("%s %p\n", __func__, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001500 return obj_request;
1501}
1502
1503static void rbd_obj_request_destroy(struct kref *kref)
1504{
1505 struct rbd_obj_request *obj_request;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001506 u32 i;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001507
1508 obj_request = container_of(kref, struct rbd_obj_request, kref);
1509
Alex Elder37206ee2013-02-20 17:32:08 -06001510 dout("%s: obj %p\n", __func__, obj_request);
1511
Alex Elderbf0d5f502012-11-22 00:00:08 -06001512 if (obj_request->osd_req)
1513 rbd_osd_req_destroy(obj_request->osd_req);
1514
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001515 switch (obj_request->img_request->data_type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001516 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001517 case OBJ_REQUEST_BIO:
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001518 case OBJ_REQUEST_BVECS:
Ilya Dryomov5359a172018-01-20 10:30:10 +01001519 break; /* Nothing to do */
Ilya Dryomovafb97882018-02-06 19:26:35 +01001520 case OBJ_REQUEST_OWN_BVECS:
1521 kfree(obj_request->bvec_pos.bvecs);
1522 break;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001523 default:
1524 rbd_assert(0);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001525 }
1526
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001527 kfree(obj_request->img_extents);
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001528 if (obj_request->copyup_bvecs) {
1529 for (i = 0; i < obj_request->copyup_bvec_count; i++) {
1530 if (obj_request->copyup_bvecs[i].bv_page)
1531 __free_page(obj_request->copyup_bvecs[i].bv_page);
1532 }
1533 kfree(obj_request->copyup_bvecs);
1534 }
Ilya Dryomovf9dcbc42018-01-20 10:30:11 +01001535
Alex Elder868311b2013-05-01 12:43:03 -05001536 kmem_cache_free(rbd_obj_request_cache, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001537}
1538
Alex Elderfb65d2282013-05-08 22:50:04 -05001539/* It's OK to call this for a device with no parent */
1540
1541static void rbd_spec_put(struct rbd_spec *spec);
1542static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1543{
1544 rbd_dev_remove_parent(rbd_dev);
1545 rbd_spec_put(rbd_dev->parent_spec);
1546 rbd_dev->parent_spec = NULL;
1547 rbd_dev->parent_overlap = 0;
1548}
1549
Alex Elderbf0d5f502012-11-22 00:00:08 -06001550/*
Alex Eldera2acd002013-05-08 22:50:04 -05001551 * Parent image reference counting is used to determine when an
1552 * image's parent fields can be safely torn down--after there are no
1553 * more in-flight requests to the parent image. When the last
1554 * reference is dropped, cleaning them up is safe.
1555 */
1556static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1557{
1558 int counter;
1559
1560 if (!rbd_dev->parent_spec)
1561 return;
1562
1563 counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1564 if (counter > 0)
1565 return;
1566
1567 /* Last reference; clean up parent data structures */
1568
1569 if (!counter)
1570 rbd_dev_unparent(rbd_dev);
1571 else
Ilya Dryomov9584d502014-07-11 12:11:20 +04001572 rbd_warn(rbd_dev, "parent reference underflow");
Alex Eldera2acd002013-05-08 22:50:04 -05001573}
1574
1575/*
1576 * If an image has a non-zero parent overlap, get a reference to its
1577 * parent.
1578 *
1579 * Returns true if the rbd device has a parent with a non-zero
1580 * overlap and a reference for it was successfully taken, or
1581 * false otherwise.
1582 */
1583static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1584{
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001585 int counter = 0;
Alex Eldera2acd002013-05-08 22:50:04 -05001586
1587 if (!rbd_dev->parent_spec)
1588 return false;
1589
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001590 down_read(&rbd_dev->header_rwsem);
1591 if (rbd_dev->parent_overlap)
1592 counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1593 up_read(&rbd_dev->header_rwsem);
Alex Eldera2acd002013-05-08 22:50:04 -05001594
1595 if (counter < 0)
Ilya Dryomov9584d502014-07-11 12:11:20 +04001596 rbd_warn(rbd_dev, "parent reference overflow");
Alex Eldera2acd002013-05-08 22:50:04 -05001597
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001598 return counter > 0;
Alex Eldera2acd002013-05-08 22:50:04 -05001599}
1600
Alex Elderbf0d5f502012-11-22 00:00:08 -06001601/*
1602 * Caller is responsible for filling in the list of object requests
1603 * that make up the image request, and the Linux request pointer
1604 * (if there is one).
1605 */
Alex Eldercc344fa2013-02-19 12:25:56 -06001606static struct rbd_img_request *rbd_img_request_create(
1607 struct rbd_device *rbd_dev,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08001608 enum obj_operation_type op_type,
Josh Durgin4e752f02014-04-08 11:12:11 -07001609 struct ceph_snap_context *snapc)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001610{
1611 struct rbd_img_request *img_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001612
Ilya Dryomova0c58952018-01-22 16:03:06 +01001613 img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001614 if (!img_request)
1615 return NULL;
1616
Alex Elderbf0d5f502012-11-22 00:00:08 -06001617 img_request->rbd_dev = rbd_dev;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001618 img_request->op_type = op_type;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001619 if (!rbd_img_is_write(img_request))
Alex Elderbf0d5f502012-11-22 00:00:08 -06001620 img_request->snap_id = rbd_dev->spec->snap_id;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001621 else
1622 img_request->snapc = snapc;
1623
Alex Eldera2acd002013-05-08 22:50:04 -05001624 if (rbd_dev_parent_get(rbd_dev))
Alex Elderd0b2e942013-01-24 16:13:36 -06001625 img_request_layered_set(img_request);
Ilya Dryomova0c58952018-01-22 16:03:06 +01001626
Alex Elderbf0d5f502012-11-22 00:00:08 -06001627 spin_lock_init(&img_request->completion_lock);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001628 INIT_LIST_HEAD(&img_request->object_extents);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001629 kref_init(&img_request->kref);
1630
Ilya Dryomovdfd98752018-02-06 19:26:35 +01001631 dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev,
1632 obj_op_name(op_type), img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001633 return img_request;
1634}
1635
1636static void rbd_img_request_destroy(struct kref *kref)
1637{
1638 struct rbd_img_request *img_request;
1639 struct rbd_obj_request *obj_request;
1640 struct rbd_obj_request *next_obj_request;
1641
1642 img_request = container_of(kref, struct rbd_img_request, kref);
1643
Alex Elder37206ee2013-02-20 17:32:08 -06001644 dout("%s: img %p\n", __func__, img_request);
1645
Alex Elderbf0d5f502012-11-22 00:00:08 -06001646 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1647 rbd_img_obj_request_del(img_request, obj_request);
Alex Elder25dcf952013-01-25 17:08:55 -06001648 rbd_assert(img_request->obj_request_count == 0);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001649
Alex Eldera2acd002013-05-08 22:50:04 -05001650 if (img_request_layered_test(img_request)) {
1651 img_request_layered_clear(img_request);
1652 rbd_dev_parent_put(img_request->rbd_dev);
1653 }
1654
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001655 if (rbd_img_is_write(img_request))
Alex Elder812164f82013-04-30 00:44:32 -05001656 ceph_put_snap_context(img_request->snapc);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001657
Alex Elder1c2a9df2013-05-01 12:43:03 -05001658 kmem_cache_free(rbd_img_request_cache, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001659}
1660
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001661static void prune_extents(struct ceph_file_extent *img_extents,
1662 u32 *num_img_extents, u64 overlap)
1663{
1664 u32 cnt = *num_img_extents;
1665
1666 /* drop extents completely beyond the overlap */
1667 while (cnt && img_extents[cnt - 1].fe_off >= overlap)
1668 cnt--;
1669
1670 if (cnt) {
1671 struct ceph_file_extent *ex = &img_extents[cnt - 1];
1672
1673 /* trim final overlapping extent */
1674 if (ex->fe_off + ex->fe_len > overlap)
1675 ex->fe_len = overlap - ex->fe_off;
1676 }
1677
1678 *num_img_extents = cnt;
1679}
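/*
 * Worked example (illustrative): with overlap == 100 and img_extents ==
 * { {0, 50}, {80, 40}, {120, 10} }, prune_extents() drops {120, 10}
 * entirely (it starts at or beyond the overlap) and trims {80, 40} down
 * to {80, 20}, leaving *num_img_extents == 2.
 */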
1680
1681/*
1682 * Determine the byte range(s) covered by either just the object extent
1683 * or the entire object in the parent image.
1684 */
1685static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
1686 bool entire)
1687{
1688 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1689 int ret;
1690
1691 if (!rbd_dev->parent_overlap)
1692 return 0;
1693
1694 ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
1695 entire ? 0 : obj_req->ex.oe_off,
1696 entire ? rbd_dev->layout.object_size :
1697 obj_req->ex.oe_len,
1698 &obj_req->img_extents,
1699 &obj_req->num_img_extents);
1700 if (ret)
1701 return ret;
1702
1703 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
1704 rbd_dev->parent_overlap);
1705 return 0;
1706}
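/*
 * Example (illustrative; assumes default striping, i.e. stripe unit ==
 * object size of 4M): with entire == true, object 3 reverse maps to the
 * parent image extent { 3 * 4M, 4M }, which prune_extents() then clips
 * against parent_overlap; with entire == false only oe_off~oe_len of
 * that object is mapped.
 */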
1707
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001708static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
1709{
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001710 switch (obj_req->img_request->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001711 case OBJ_REQUEST_BIO:
1712 osd_req_op_extent_osd_data_bio(obj_req->osd_req, which,
1713 &obj_req->bio_pos,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001714 obj_req->ex.oe_len);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001715 break;
1716 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01001717 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001718 rbd_assert(obj_req->bvec_pos.iter.bi_size ==
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001719 obj_req->ex.oe_len);
Ilya Dryomovafb97882018-02-06 19:26:35 +01001720 rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001721 osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which,
1722 &obj_req->bvec_pos);
1723 break;
1724 default:
1725 rbd_assert(0);
1726 }
1727}
1728
1729static int rbd_obj_setup_read(struct rbd_obj_request *obj_req)
1730{
Ilya Dryomova162b302018-01-30 17:52:10 +01001731 obj_req->osd_req = rbd_osd_req_create(obj_req, 1);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001732 if (!obj_req->osd_req)
1733 return -ENOMEM;
1734
1735 osd_req_op_extent_init(obj_req->osd_req, 0, CEPH_OSD_OP_READ,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001736 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001737 rbd_osd_req_setup_data(obj_req, 0);
1738
1739 rbd_osd_req_format_read(obj_req);
1740 return 0;
1741}
1742
1743static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req,
1744 unsigned int which)
1745{
1746 struct page **pages;
1747
1748 /*
1749 * The response data for a STAT call consists of:
1750 * le64 length;
1751 * struct {
1752 * le32 tv_sec;
1753 * le32 tv_nsec;
1754 * } mtime;
1755 */
1756 pages = ceph_alloc_page_vector(1, GFP_NOIO);
1757 if (IS_ERR(pages))
1758 return PTR_ERR(pages);
1759
1760 osd_req_op_init(obj_req->osd_req, which, CEPH_OSD_OP_STAT, 0);
1761 osd_req_op_raw_data_in_pages(obj_req->osd_req, which, pages,
1762 8 + sizeof(struct ceph_timespec),
1763 0, false, true);
1764 return 0;
1765}
1766
1767static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req,
1768 unsigned int which)
1769{
1770 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1771 u16 opcode;
1772
1773 osd_req_op_alloc_hint_init(obj_req->osd_req, which++,
1774 rbd_dev->layout.object_size,
1775 rbd_dev->layout.object_size);
1776
1777 if (rbd_obj_is_entire(obj_req))
1778 opcode = CEPH_OSD_OP_WRITEFULL;
1779 else
1780 opcode = CEPH_OSD_OP_WRITE;
1781
1782 osd_req_op_extent_init(obj_req->osd_req, which, opcode,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001783 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001784 rbd_osd_req_setup_data(obj_req, which++);
1785
1786 rbd_assert(which == obj_req->osd_req->r_num_ops);
1787 rbd_osd_req_format_write(obj_req);
1788}
1789
1790static int rbd_obj_setup_write(struct rbd_obj_request *obj_req)
1791{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001792 unsigned int num_osd_ops, which = 0;
1793 int ret;
1794
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001795 /* reverse map the entire object onto the parent */
1796 ret = rbd_obj_calc_img_extents(obj_req, true);
1797 if (ret)
1798 return ret;
1799
1800 if (obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001801 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
1802 num_osd_ops = 3; /* stat + setallochint + write/writefull */
1803 } else {
1804 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
1805 num_osd_ops = 2; /* setallochint + write/writefull */
1806 }
1807
Ilya Dryomova162b302018-01-30 17:52:10 +01001808 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001809 if (!obj_req->osd_req)
1810 return -ENOMEM;
1811
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001812 if (obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001813 ret = __rbd_obj_setup_stat(obj_req, which++);
1814 if (ret)
1815 return ret;
1816 }
1817
1818 __rbd_obj_setup_write(obj_req, which);
1819 return 0;
1820}
1821
1822static void __rbd_obj_setup_discard(struct rbd_obj_request *obj_req,
1823 unsigned int which)
1824{
1825 u16 opcode;
1826
1827 if (rbd_obj_is_entire(obj_req)) {
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001828 if (obj_req->num_img_extents) {
Ilya Dryomov2bb1e562018-02-06 19:26:34 +01001829 osd_req_op_init(obj_req->osd_req, which++,
1830 CEPH_OSD_OP_CREATE, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001831 opcode = CEPH_OSD_OP_TRUNCATE;
1832 } else {
1833 osd_req_op_init(obj_req->osd_req, which++,
1834 CEPH_OSD_OP_DELETE, 0);
1835 opcode = 0;
1836 }
1837 } else if (rbd_obj_is_tail(obj_req)) {
1838 opcode = CEPH_OSD_OP_TRUNCATE;
1839 } else {
1840 opcode = CEPH_OSD_OP_ZERO;
1841 }
1842
1843 if (opcode)
1844 osd_req_op_extent_init(obj_req->osd_req, which++, opcode,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001845 obj_req->ex.oe_off, obj_req->ex.oe_len,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001846 0, 0);
1847
1848 rbd_assert(which == obj_req->osd_req->r_num_ops);
1849 rbd_osd_req_format_write(obj_req);
1850}
1851
1852static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req)
1853{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001854 unsigned int num_osd_ops, which = 0;
1855 int ret;
1856
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001857 /* reverse map the entire object onto the parent */
1858 ret = rbd_obj_calc_img_extents(obj_req, true);
1859 if (ret)
1860 return ret;
1861
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001862 if (rbd_obj_is_entire(obj_req)) {
1863 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
Ilya Dryomov2bb1e562018-02-06 19:26:34 +01001864 if (obj_req->num_img_extents)
1865 num_osd_ops = 2; /* create + truncate */
1866 else
1867 num_osd_ops = 1; /* delete */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001868 } else {
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001869 if (obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001870 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
1871 num_osd_ops = 2; /* stat + truncate/zero */
1872 } else {
1873 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
1874 num_osd_ops = 1; /* truncate/zero */
1875 }
1876 }
1877
Ilya Dryomova162b302018-01-30 17:52:10 +01001878 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001879 if (!obj_req->osd_req)
1880 return -ENOMEM;
1881
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001882 if (!rbd_obj_is_entire(obj_req) && obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001883 ret = __rbd_obj_setup_stat(obj_req, which++);
1884 if (ret)
1885 return ret;
1886 }
1887
1888 __rbd_obj_setup_discard(obj_req, which);
1889 return 0;
1890}
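/*
 * Summary of the discard cases set up above (illustrative):
 *
 *	entire object, maps onto parent:  create + truncate(0)
 *	entire object, no parent data:    delete
 *	tail of object:                   [stat +] truncate(oe_off)
 *	middle of object:                 [stat +] zero(oe_off, oe_len)
 *
 * The stat guard is added only when the object extent maps onto the
 * parent (obj_req->num_img_extents != 0).
 */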
1891
1892/*
1893 * For each object request in @img_req, allocate an OSD request, add
1894 * individual OSD ops and prepare them for submission. The number of
1895 * OSD ops depends on op_type and the overlap point (if any).
1896 */
1897static int __rbd_img_fill_request(struct rbd_img_request *img_req)
1898{
1899 struct rbd_obj_request *obj_req;
1900 int ret;
1901
1902 for_each_obj_request(img_req, obj_req) {
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001903 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001904 case OBJ_OP_READ:
1905 ret = rbd_obj_setup_read(obj_req);
1906 break;
1907 case OBJ_OP_WRITE:
1908 ret = rbd_obj_setup_write(obj_req);
1909 break;
1910 case OBJ_OP_DISCARD:
1911 ret = rbd_obj_setup_discard(obj_req);
1912 break;
1913 default:
1914 rbd_assert(0);
1915 }
1916 if (ret)
1917 return ret;
1918 }
1919
1920 return 0;
1921}
1922
Ilya Dryomov5a237812018-02-06 19:26:34 +01001923union rbd_img_fill_iter {
1924 struct ceph_bio_iter bio_iter;
1925 struct ceph_bvec_iter bvec_iter;
1926};
1927
1928struct rbd_img_fill_ctx {
1929 enum obj_request_type pos_type;
1930 union rbd_img_fill_iter *pos;
1931 union rbd_img_fill_iter iter;
1932 ceph_object_extent_fn_t set_pos_fn;
Ilya Dryomovafb97882018-02-06 19:26:35 +01001933 ceph_object_extent_fn_t count_fn;
1934 ceph_object_extent_fn_t copy_fn;
Ilya Dryomov5a237812018-02-06 19:26:34 +01001935};
1936
1937static struct ceph_object_extent *alloc_object_extent(void *arg)
1938{
1939 struct rbd_img_request *img_req = arg;
1940 struct rbd_obj_request *obj_req;
1941
1942 obj_req = rbd_obj_request_create();
1943 if (!obj_req)
1944 return NULL;
1945
1946 rbd_img_obj_request_add(img_req, obj_req);
1947 return &obj_req->ex;
1948}
1949
1950/*
Ilya Dryomovafb97882018-02-06 19:26:35 +01001951 * While su != os && sc == 1 is technically not fancy (it's the same
1952 * layout as su == os && sc == 1), we can't use the nocopy path for it
1953 * because ->set_pos_fn() should be called only once per object.
1954 * ceph_file_to_extents() invokes action_fn once per stripe unit, so
1955 * treat su != os && sc == 1 as fancy.
Ilya Dryomov5a237812018-02-06 19:26:34 +01001956 */
Ilya Dryomovafb97882018-02-06 19:26:35 +01001957static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
1958{
1959 return l->stripe_unit != l->object_size;
1960}
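/*
 * Example (illustrative): the default layout has su == os (e.g. both
 * 4M) with sc == 1 and is not fancy, so the nocopy path is used; an
 * image created with e.g. --stripe-unit 64K --stripe-count 8
 * --object-size 4M has su != os and takes the bvec-copying path below.
 */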
1961
1962static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
1963 struct ceph_file_extent *img_extents,
1964 u32 num_img_extents,
1965 struct rbd_img_fill_ctx *fctx)
Ilya Dryomov5a237812018-02-06 19:26:34 +01001966{
1967 u32 i;
1968 int ret;
1969
1970 img_req->data_type = fctx->pos_type;
1971
1972 /*
1973 * Create object requests and set each object request's starting
1974 * position in the provided bio (list) or bio_vec array.
1975 */
1976 fctx->iter = *fctx->pos;
1977 for (i = 0; i < num_img_extents; i++) {
1978 ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
1979 img_extents[i].fe_off,
1980 img_extents[i].fe_len,
1981 &img_req->object_extents,
1982 alloc_object_extent, img_req,
1983 fctx->set_pos_fn, &fctx->iter);
1984 if (ret)
1985 return ret;
1986 }
1987
1988 return __rbd_img_fill_request(img_req);
1989}
1990
Ilya Dryomovafb97882018-02-06 19:26:35 +01001991/*
1992 * Map a list of image extents to a list of object extents, create the
1993 * corresponding object requests (normally each to a different object,
1994 * but not always) and add them to @img_req. For each object request,
1995 * set up its data descriptor to point to the corresponding chunk(s) of
1996 * @fctx->pos data buffer.
1997 *
1998 * Because ceph_file_to_extents() will merge adjacent object extents
1999 * together, each object request's data descriptor may point to multiple
2000 * different chunks of @fctx->pos data buffer.
2001 *
2002 * @fctx->pos data buffer is assumed to be large enough.
2003 */
2004static int rbd_img_fill_request(struct rbd_img_request *img_req,
2005 struct ceph_file_extent *img_extents,
2006 u32 num_img_extents,
2007 struct rbd_img_fill_ctx *fctx)
2008{
2009 struct rbd_device *rbd_dev = img_req->rbd_dev;
2010 struct rbd_obj_request *obj_req;
2011 u32 i;
2012 int ret;
2013
2014 if (fctx->pos_type == OBJ_REQUEST_NODATA ||
2015 !rbd_layout_is_fancy(&rbd_dev->layout))
2016 return rbd_img_fill_request_nocopy(img_req, img_extents,
2017 num_img_extents, fctx);
2018
2019 img_req->data_type = OBJ_REQUEST_OWN_BVECS;
2020
2021 /*
2022 * Create object requests and determine ->bvec_count for each object
2023 * request. Note that ->bvec_count sum over all object requests may
2024 * be greater than the number of bio_vecs in the provided bio (list)
2025 * or bio_vec array because when mapped, those bio_vecs can straddle
2026 * stripe unit boundaries.
2027 */
2028 fctx->iter = *fctx->pos;
2029 for (i = 0; i < num_img_extents; i++) {
2030 ret = ceph_file_to_extents(&rbd_dev->layout,
2031 img_extents[i].fe_off,
2032 img_extents[i].fe_len,
2033 &img_req->object_extents,
2034 alloc_object_extent, img_req,
2035 fctx->count_fn, &fctx->iter);
2036 if (ret)
2037 return ret;
2038 }
2039
2040 for_each_obj_request(img_req, obj_req) {
2041 obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
2042 sizeof(*obj_req->bvec_pos.bvecs),
2043 GFP_NOIO);
2044 if (!obj_req->bvec_pos.bvecs)
2045 return -ENOMEM;
2046 }
2047
2048 /*
2049 * Fill in each object request's private bio_vec array, splitting and
2050 * rearranging the provided bio_vecs in stripe unit chunks as needed.
2051 */
2052 fctx->iter = *fctx->pos;
2053 for (i = 0; i < num_img_extents; i++) {
2054 ret = ceph_iterate_extents(&rbd_dev->layout,
2055 img_extents[i].fe_off,
2056 img_extents[i].fe_len,
2057 &img_req->object_extents,
2058 fctx->copy_fn, &fctx->iter);
2059 if (ret)
2060 return ret;
2061 }
2062
2063 return __rbd_img_fill_request(img_req);
2064}
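/*
 * Worked example (illustrative): with su == 8K and sc == 2, objects 0
 * and 1 hold alternating 8K stripe units. A single 16K bio_vec covering
 * image offsets 0..16K straddles a stripe unit boundary: the count pass
 * bumps ->bvec_count once in each of the two object requests, and the
 * copy pass splits the source bio_vec into two 8K chunks, one in each
 * request's private bvec array.
 */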
2065
Ilya Dryomov5a237812018-02-06 19:26:34 +01002066static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
2067 u64 off, u64 len)
2068{
2069 struct ceph_file_extent ex = { off, len };
2070 union rbd_img_fill_iter dummy;
2071 struct rbd_img_fill_ctx fctx = {
2072 .pos_type = OBJ_REQUEST_NODATA,
2073 .pos = &dummy,
2074 };
2075
2076 return rbd_img_fill_request(img_req, &ex, 1, &fctx);
2077}
2078
2079static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2080{
2081 struct rbd_obj_request *obj_req =
2082 container_of(ex, struct rbd_obj_request, ex);
2083 struct ceph_bio_iter *it = arg;
2084
2085 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2086 obj_req->bio_pos = *it;
2087 ceph_bio_iter_advance(it, bytes);
2088}
2089
Ilya Dryomovafb97882018-02-06 19:26:35 +01002090static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2091{
2092 struct rbd_obj_request *obj_req =
2093 container_of(ex, struct rbd_obj_request, ex);
2094 struct ceph_bio_iter *it = arg;
2095
2096 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2097 ceph_bio_iter_advance_step(it, bytes, ({
2098 obj_req->bvec_count++;
2099 }));
2101}
2102
2103static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2104{
2105 struct rbd_obj_request *obj_req =
2106 container_of(ex, struct rbd_obj_request, ex);
2107 struct ceph_bio_iter *it = arg;
2108
2109 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2110 ceph_bio_iter_advance_step(it, bytes, ({
2111 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2112 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2113 }));
2114}
2115
Ilya Dryomov5a237812018-02-06 19:26:34 +01002116static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2117 struct ceph_file_extent *img_extents,
2118 u32 num_img_extents,
2119 struct ceph_bio_iter *bio_pos)
2120{
2121 struct rbd_img_fill_ctx fctx = {
2122 .pos_type = OBJ_REQUEST_BIO,
2123 .pos = (union rbd_img_fill_iter *)bio_pos,
2124 .set_pos_fn = set_bio_pos,
Ilya Dryomovafb97882018-02-06 19:26:35 +01002125 .count_fn = count_bio_bvecs,
2126 .copy_fn = copy_bio_bvecs,
Ilya Dryomov5a237812018-02-06 19:26:34 +01002127 };
2128
2129 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2130 &fctx);
2131}
2132
2133static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2134 u64 off, u64 len, struct bio *bio)
2135{
2136 struct ceph_file_extent ex = { off, len };
2137 struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };
2138
2139 return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
2140}
2141
2142static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2143{
2144 struct rbd_obj_request *obj_req =
2145 container_of(ex, struct rbd_obj_request, ex);
2146 struct ceph_bvec_iter *it = arg;
2147
2148 obj_req->bvec_pos = *it;
2149 ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
2150 ceph_bvec_iter_advance(it, bytes);
2151}
2152
Ilya Dryomovafb97882018-02-06 19:26:35 +01002153static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2154{
2155 struct rbd_obj_request *obj_req =
2156 container_of(ex, struct rbd_obj_request, ex);
2157 struct ceph_bvec_iter *it = arg;
2158
2159 ceph_bvec_iter_advance_step(it, bytes, ({
2160 obj_req->bvec_count++;
2161 }));
2162}
2163
2164static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2165{
2166 struct rbd_obj_request *obj_req =
2167 container_of(ex, struct rbd_obj_request, ex);
2168 struct ceph_bvec_iter *it = arg;
2169
2170 ceph_bvec_iter_advance_step(it, bytes, ({
2171 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2172 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2173 }));
2174}
2175
Ilya Dryomov5a237812018-02-06 19:26:34 +01002176static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2177 struct ceph_file_extent *img_extents,
2178 u32 num_img_extents,
2179 struct ceph_bvec_iter *bvec_pos)
2180{
2181 struct rbd_img_fill_ctx fctx = {
2182 .pos_type = OBJ_REQUEST_BVECS,
2183 .pos = (union rbd_img_fill_iter *)bvec_pos,
2184 .set_pos_fn = set_bvec_pos,
Ilya Dryomovafb97882018-02-06 19:26:35 +01002185 .count_fn = count_bvecs,
2186 .copy_fn = copy_bvecs,
Ilya Dryomov5a237812018-02-06 19:26:34 +01002187 };
2188
2189 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2190 &fctx);
2191}
2192
2193static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2194 struct ceph_file_extent *img_extents,
2195 u32 num_img_extents,
2196 struct bio_vec *bvecs)
2197{
2198 struct ceph_bvec_iter it = {
2199 .bvecs = bvecs,
2200 .iter = { .bi_size = ceph_file_extents_bytes(img_extents,
2201 num_img_extents) },
2202 };
2203
2204 return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
2205 &it);
2206}
2207
Ilya Dryomovefbd1a12018-01-30 17:52:11 +01002208static void rbd_img_request_submit(struct rbd_img_request *img_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002209{
Alex Elderbf0d5f502012-11-22 00:00:08 -06002210 struct rbd_obj_request *obj_request;
2211
Alex Elder37206ee2013-02-20 17:32:08 -06002212 dout("%s: img %p\n", __func__, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002213
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002214 rbd_img_request_get(img_request);
Ilya Dryomovefbd1a12018-01-30 17:52:11 +01002215 for_each_obj_request(img_request, obj_request)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002216 rbd_obj_request_submit(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002217
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002218 rbd_img_request_put(img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002219}
2220
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002221static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002222{
2223 struct rbd_img_request *img_req = obj_req->img_request;
2224 struct rbd_img_request *child_img_req;
2225 int ret;
2226
Ilya Dryomove93aca02018-02-06 19:26:35 +01002227 child_img_req = rbd_img_request_create(img_req->rbd_dev->parent,
2228 OBJ_OP_READ, NULL);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002229 if (!child_img_req)
2230 return -ENOMEM;
2231
Ilya Dryomove93aca02018-02-06 19:26:35 +01002232 __set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2233 child_img_req->obj_request = obj_req;
2234
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002235 if (!rbd_img_is_write(img_req)) {
Ilya Dryomovecc633c2018-02-01 11:50:47 +01002236 switch (img_req->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002237 case OBJ_REQUEST_BIO:
Ilya Dryomov5a237812018-02-06 19:26:34 +01002238 ret = __rbd_img_fill_from_bio(child_img_req,
2239 obj_req->img_extents,
2240 obj_req->num_img_extents,
2241 &obj_req->bio_pos);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002242 break;
2243 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01002244 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov5a237812018-02-06 19:26:34 +01002245 ret = __rbd_img_fill_from_bvecs(child_img_req,
2246 obj_req->img_extents,
2247 obj_req->num_img_extents,
2248 &obj_req->bvec_pos);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002249 break;
2250 default:
2251 rbd_assert(0);
2252 }
2253 } else {
Ilya Dryomov5a237812018-02-06 19:26:34 +01002254 ret = rbd_img_fill_from_bvecs(child_img_req,
2255 obj_req->img_extents,
2256 obj_req->num_img_extents,
2257 obj_req->copyup_bvecs);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002258 }
2259 if (ret) {
2260 rbd_img_request_put(child_img_req);
2261 return ret;
2262 }
2263
2264 rbd_img_request_submit(child_img_req);
2265 return 0;
2266}
2267
2268static bool rbd_obj_handle_read(struct rbd_obj_request *obj_req)
2269{
2270 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2271 int ret;
2272
2273 if (obj_req->result == -ENOENT &&
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002274 rbd_dev->parent_overlap && !obj_req->tried_parent) {
2275 /* reverse map this object extent onto the parent */
2276 ret = rbd_obj_calc_img_extents(obj_req, false);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002277 if (ret) {
2278 obj_req->result = ret;
2279 return true;
2280 }
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002281
2282 if (obj_req->num_img_extents) {
2283 obj_req->tried_parent = true;
2284 ret = rbd_obj_read_from_parent(obj_req);
2285 if (ret) {
2286 obj_req->result = ret;
2287 return true;
2288 }
2289 return false;
2290 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002291 }
2292
2293 /*
2294 * -ENOENT means a hole in the image -- zero-fill the entire
2295 * length of the request. A short read also implies zero-fill
2296 * to the end of the request. In both cases we update xferred
2297 * count to indicate the whole request was satisfied.
2298 */
2299 if (obj_req->result == -ENOENT ||
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002300 (!obj_req->result && obj_req->xferred < obj_req->ex.oe_len)) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002301 rbd_assert(!obj_req->xferred || !obj_req->result);
2302 rbd_obj_zero_range(obj_req, obj_req->xferred,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002303 obj_req->ex.oe_len - obj_req->xferred);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002304 obj_req->result = 0;
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002305 obj_req->xferred = obj_req->ex.oe_len;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002306 }
2307
2308 return true;
2309}
2310
2311/*
2312 * copyup_bvecs pages are never highmem pages
2313 */
2314static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
2315{
2316 struct ceph_bvec_iter it = {
2317 .bvecs = bvecs,
2318 .iter = { .bi_size = bytes },
2319 };
2320
2321 ceph_bvec_iter_advance_step(&it, bytes, ({
2322 if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
2323 bv.bv_len))
2324 return false;
2325 }));
2326 return true;
2327}
2328
2329static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes)
2330{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002331 unsigned int num_osd_ops = obj_req->osd_req->r_num_ops;
2332
2333 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
2334 rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT);
2335 rbd_osd_req_destroy(obj_req->osd_req);
2336
2337 /*
2338 * Create a copyup request with the same number of OSD ops as
2339 * the original request. The original request was stat + op(s),
2340 * the new copyup request will be copyup + the same op(s).
2341 */
Ilya Dryomova162b302018-01-30 17:52:10 +01002342 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002343 if (!obj_req->osd_req)
2344 return -ENOMEM;
2345
2346 /*
2347 * Only send non-zero copyup data to save some I/O and network
2348 * bandwidth -- zero copyup data is equivalent to the object not
2349 * existing.
2350 */
2351 if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) {
2352 dout("%s obj_req %p detected zeroes\n", __func__, obj_req);
2353 bytes = 0;
2354 }
2355
2356 osd_req_op_cls_init(obj_req->osd_req, 0, CEPH_OSD_OP_CALL, "rbd",
2357 "copyup");
2358 osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0,
2359 obj_req->copyup_bvecs, bytes);
2360
Ilya Dryomov9bb02482018-01-30 17:52:10 +01002361 switch (obj_req->img_request->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002362 case OBJ_OP_WRITE:
2363 __rbd_obj_setup_write(obj_req, 1);
2364 break;
2365 case OBJ_OP_DISCARD:
2366 rbd_assert(!rbd_obj_is_entire(obj_req));
2367 __rbd_obj_setup_discard(obj_req, 1);
2368 break;
2369 default:
2370 rbd_assert(0);
2371 }
2372
2373 rbd_obj_request_submit(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002374 return 0;
2375}
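/*
 * Illustrative op sequences (assuming a guarded layered write): the
 * original request [stat, setallochint, write] is reissued here as
 * [copyup, setallochint, write] -- the class method call replaces the
 * stat guard at index 0 and the remaining op(s) are rebuilt as before.
 */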
2376
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01002377static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
2378{
2379 u32 i;
2380
2381 rbd_assert(!obj_req->copyup_bvecs);
2382 obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
2383 obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
2384 sizeof(*obj_req->copyup_bvecs),
2385 GFP_NOIO);
2386 if (!obj_req->copyup_bvecs)
2387 return -ENOMEM;
2388
2389 for (i = 0; i < obj_req->copyup_bvec_count; i++) {
2390 unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
2391
2392 obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
2393 if (!obj_req->copyup_bvecs[i].bv_page)
2394 return -ENOMEM;
2395
2396 obj_req->copyup_bvecs[i].bv_offset = 0;
2397 obj_req->copyup_bvecs[i].bv_len = len;
2398 obj_overlap -= len;
2399 }
2400
2401 rbd_assert(!obj_overlap);
2402 return 0;
2403}
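/*
 * Worked example (illustrative; assumes 4K pages): an object overlap of
 * 9000 bytes gives copyup_bvec_count == calc_pages_for(0, 9000) == 3,
 * with bv_len values of 4096, 4096 and 808 -- full pages followed by a
 * partial tail page.
 */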
2404
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002405static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req)
2406{
2407 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002408 int ret;
2409
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002410 rbd_assert(obj_req->num_img_extents);
2411 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
2412 rbd_dev->parent_overlap);
2413 if (!obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002414 /*
2415 * The overlap has become 0 (most likely because the
2416 * image has been flattened). Use rbd_obj_issue_copyup()
2417 * to re-submit the original write request -- the copyup
2418 * operation itself will be a no-op, since someone must
2419 * have populated the child object while we weren't
2420 * looking. Move to WRITE_FLAT state as we'll be done
2421 * with the operation once the null copyup completes.
2422 */
2423 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
2424 return rbd_obj_issue_copyup(obj_req, 0);
2425 }
2426
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002427 ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002428 if (ret)
2429 return ret;
2430
2431 obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002432 return rbd_obj_read_from_parent(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002433}
2434
2435static bool rbd_obj_handle_write(struct rbd_obj_request *obj_req)
2436{
2437 int ret;
2438
2439again:
2440 switch (obj_req->write_state) {
2441 case RBD_OBJ_WRITE_GUARD:
2442 rbd_assert(!obj_req->xferred);
2443 if (obj_req->result == -ENOENT) {
2444 /*
2445 * The target object doesn't exist. Read the data for
2446 * the entire target object up to the overlap point (if
2447 * any) from the parent, so we can use it for a copyup.
2448 */
2449 ret = rbd_obj_handle_write_guard(obj_req);
2450 if (ret) {
2451 obj_req->result = ret;
2452 return true;
2453 }
2454 return false;
2455 }
2456 /* fall through */
2457 case RBD_OBJ_WRITE_FLAT:
2458 if (!obj_req->result)
2459 /*
2460 * There is no such thing as a successful short
2461 * write -- indicate the whole request was satisfied.
2462 */
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002463 obj_req->xferred = obj_req->ex.oe_len;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002464 return true;
2465 case RBD_OBJ_WRITE_COPYUP:
2466 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
2467 if (obj_req->result)
2468 goto again;
2469
2470 rbd_assert(obj_req->xferred);
2471 ret = rbd_obj_issue_copyup(obj_req, obj_req->xferred);
2472 if (ret) {
2473 obj_req->result = ret;
2474 return true;
2475 }
2476 return false;
2477 default:
2478 rbd_assert(0);
2479 }
2480}
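/*
 * Illustrative sketch of the write/discard state machine driven above
 * (states are RBD_OBJ_WRITE_*):
 *
 *	FLAT ------------------------------------------> done
 *	GUARD --(-ENOENT)--> read from parent --> COPYUP
 *	COPYUP --> copyup [+ original op(s)] --> GUARD --> done
 *
 * A guarded request whose stat succeeds (the object exists) completes
 * like FLAT via the fall through above; a parent overlap that has gone
 * to zero short-circuits to a no-op copyup in FLAT state.
 */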
2481
2482/*
2483 * Returns true if @obj_req is completed, or false otherwise.
2484 */
2485static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req)
2486{
Ilya Dryomov9bb02482018-01-30 17:52:10 +01002487 switch (obj_req->img_request->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002488 case OBJ_OP_READ:
2489 return rbd_obj_handle_read(obj_req);
2490 case OBJ_OP_WRITE:
2491 return rbd_obj_handle_write(obj_req);
2492 case OBJ_OP_DISCARD:
2493 if (rbd_obj_handle_write(obj_req)) {
2494 /*
2495 * Hide -ENOENT from delete/truncate/zero -- discarding
2496 * a non-existent object is not a problem.
2497 */
2498 if (obj_req->result == -ENOENT) {
2499 obj_req->result = 0;
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002500 obj_req->xferred = obj_req->ex.oe_len;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002501 }
2502 return true;
2503 }
2504 return false;
2505 default:
2506 rbd_assert(0);
2507 }
2508}
2509
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002510static void rbd_obj_end_request(struct rbd_obj_request *obj_req)
2511{
2512 struct rbd_img_request *img_req = obj_req->img_request;
2513
2514 rbd_assert((!obj_req->result &&
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002515 obj_req->xferred == obj_req->ex.oe_len) ||
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002516 (obj_req->result < 0 && !obj_req->xferred));
2517 if (!obj_req->result) {
2518 img_req->xferred += obj_req->xferred;
2519 return;
2520 }
2521
2522 rbd_warn(img_req->rbd_dev,
2523 "%s at objno %llu %llu~%llu result %d xferred %llu",
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002524 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
2525 obj_req->ex.oe_off, obj_req->ex.oe_len, obj_req->result,
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002526 obj_req->xferred);
2527 if (!img_req->result) {
2528 img_req->result = obj_req->result;
2529 img_req->xferred = 0;
2530 }
2531}
2532
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002533static void rbd_img_end_child_request(struct rbd_img_request *img_req)
2534{
2535 struct rbd_obj_request *obj_req = img_req->obj_request;
2536
2537 rbd_assert(test_bit(IMG_REQ_CHILD, &img_req->flags));
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002538 rbd_assert((!img_req->result &&
2539 img_req->xferred == rbd_obj_img_extents_bytes(obj_req)) ||
2540 (img_req->result < 0 && !img_req->xferred));
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002541
2542 obj_req->result = img_req->result;
2543 obj_req->xferred = img_req->xferred;
2544 rbd_img_request_put(img_req);
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002545}
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002546
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002547static void rbd_img_end_request(struct rbd_img_request *img_req)
2548{
2549 rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
2550 rbd_assert((!img_req->result &&
2551 img_req->xferred == blk_rq_bytes(img_req->rq)) ||
2552 (img_req->result < 0 && !img_req->xferred));
2553
2554 blk_mq_end_request(img_req->rq,
2555 errno_to_blk_status(img_req->result));
2556 rbd_img_request_put(img_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002557}
2558
2559static void rbd_obj_handle_request(struct rbd_obj_request *obj_req)
2560{
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002561 struct rbd_img_request *img_req;
2562
2563again:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002564 if (!__rbd_obj_handle_request(obj_req))
2565 return;
2566
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002567 img_req = obj_req->img_request;
2568 spin_lock(&img_req->completion_lock);
2569 rbd_obj_end_request(obj_req);
2570 rbd_assert(img_req->pending_count);
2571 if (--img_req->pending_count) {
2572 spin_unlock(&img_req->completion_lock);
2573 return;
2574 }
2575
2576 spin_unlock(&img_req->completion_lock);
2577 if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
2578 obj_req = img_req->obj_request;
2579 rbd_img_end_child_request(img_req);
2580 goto again;
2581 }
2582 rbd_img_end_request(img_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002583}
2584
Ilya Dryomoved95b212016-08-12 16:40:02 +02002585static const struct rbd_client_id rbd_empty_cid;
2586
2587static bool rbd_cid_equal(const struct rbd_client_id *lhs,
2588 const struct rbd_client_id *rhs)
2589{
2590 return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
2591}
2592
2593static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
2594{
2595 struct rbd_client_id cid;
2596
2597 mutex_lock(&rbd_dev->watch_mutex);
2598 cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
2599 cid.handle = rbd_dev->watch_cookie;
2600 mutex_unlock(&rbd_dev->watch_mutex);
2601 return cid;
2602}
2603
2604/*
2605 * lock_rwsem must be held for write
2606 */
2607static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
2608 const struct rbd_client_id *cid)
2609{
2610 dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
2611 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
2612 cid->gid, cid->handle);
2613 rbd_dev->owner_cid = *cid; /* struct */
2614}
2615
2616static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
2617{
2618 mutex_lock(&rbd_dev->watch_mutex);
2619 sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
2620 mutex_unlock(&rbd_dev->watch_mutex);
2621}
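/*
 * Example (illustrative; assumes RBD_LOCK_COOKIE_PREFIX is "auto", as
 * defined earlier in this file): with watch_cookie == 140939 the lock
 * cookie becomes the string "auto 140939". Embedding the watch cookie
 * ties the lock to this client's watch.
 */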
2622
Florian Margaineedd8ca82017-12-13 16:43:59 +01002623static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
2624{
2625 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
2626
2627 strcpy(rbd_dev->lock_cookie, cookie);
2628 rbd_set_owner_cid(rbd_dev, &cid);
2629 queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
2630}
2631
Ilya Dryomoved95b212016-08-12 16:40:02 +02002632/*
2633 * lock_rwsem must be held for write
2634 */
2635static int rbd_lock(struct rbd_device *rbd_dev)
2636{
2637 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomoved95b212016-08-12 16:40:02 +02002638 char cookie[32];
2639 int ret;
2640
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02002641 WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
2642 rbd_dev->lock_cookie[0] != '\0');
Ilya Dryomoved95b212016-08-12 16:40:02 +02002643
2644 format_lock_cookie(rbd_dev, cookie);
2645 ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
2646 RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
2647 RBD_LOCK_TAG, "", 0);
2648 if (ret)
2649 return ret;
2650
2651 rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
Florian Margaineedd8ca82017-12-13 16:43:59 +01002652 __rbd_lock(rbd_dev, cookie);
Ilya Dryomoved95b212016-08-12 16:40:02 +02002653 return 0;
2654}
2655
2656/*
2657 * lock_rwsem must be held for write
2658 */
Ilya Dryomovbbead742017-04-13 12:17:38 +02002659static void rbd_unlock(struct rbd_device *rbd_dev)
Ilya Dryomoved95b212016-08-12 16:40:02 +02002660{
2661 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomoved95b212016-08-12 16:40:02 +02002662 int ret;
2663
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02002664 WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
2665 rbd_dev->lock_cookie[0] == '\0');
Ilya Dryomoved95b212016-08-12 16:40:02 +02002666
Ilya Dryomoved95b212016-08-12 16:40:02 +02002667 ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02002668 RBD_LOCK_NAME, rbd_dev->lock_cookie);
Ilya Dryomovbbead742017-04-13 12:17:38 +02002669 if (ret && ret != -ENOENT)
2670 rbd_warn(rbd_dev, "failed to unlock: %d", ret);
Ilya Dryomoved95b212016-08-12 16:40:02 +02002671
Ilya Dryomovbbead742017-04-13 12:17:38 +02002672	/* treat errors as if the image were unlocked */
2673 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02002674 rbd_dev->lock_cookie[0] = '\0';
Ilya Dryomoved95b212016-08-12 16:40:02 +02002675 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
2676 queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
Ilya Dryomoved95b212016-08-12 16:40:02 +02002677}
2678
static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
				enum rbd_notify_op notify_op,
				struct page ***preply_pages,
				size_t *preply_len)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
	int buf_size = 4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN;
	char buf[buf_size];
	void *p = buf;

	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);

	/* encode *LockPayload NotifyMessage (op + ClientId) */
	ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
	ceph_encode_32(&p, notify_op);
	ceph_encode_64(&p, cid.gid);
	ceph_encode_64(&p, cid.handle);

	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
				&rbd_dev->header_oloc, buf, buf_size,
				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
}

static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
			       enum rbd_notify_op notify_op)
{
	struct page **reply_pages;
	size_t reply_len;

	__rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
}

static void rbd_notify_acquired_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  acquired_lock_work);

	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
}

static void rbd_notify_released_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  released_lock_work);

	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
}

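/*
 * Ask the current lock owner to release the lock.  The reply carries
 * one ResponseMessage per acker; 0 from the owner means it agreed to
 * release the lock, -EROFS means it refused.  Returns -ETIMEDOUT if
 * no owner responded and -EIO if more than one client claimed
 * ownership.
 */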
static int rbd_request_lock(struct rbd_device *rbd_dev)
{
	struct page **reply_pages;
	size_t reply_len;
	bool lock_owner_responded = false;
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
				   &reply_pages, &reply_len);
	if (ret && ret != -ETIMEDOUT) {
		rbd_warn(rbd_dev, "failed to request lock: %d", ret);
		goto out;
	}

	if (reply_len > 0 && reply_len <= PAGE_SIZE) {
		void *p = page_address(reply_pages[0]);
		void *const end = p + reply_len;
		u32 n;

		ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
		while (n--) {
			u8 struct_v;
			u32 len;

			ceph_decode_need(&p, end, 8 + 8, e_inval);
			p += 8 + 8; /* skip gid and cookie */

			ceph_decode_32_safe(&p, end, len, e_inval);
			if (!len)
				continue;

			if (lock_owner_responded) {
				rbd_warn(rbd_dev,
					 "duplicate lock owners detected");
				ret = -EIO;
				goto out;
			}

			lock_owner_responded = true;
			ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
						  &struct_v, &len);
			if (ret) {
				rbd_warn(rbd_dev,
					 "failed to decode ResponseMessage: %d",
					 ret);
				goto e_inval;
			}

			ret = ceph_decode_32(&p);
		}
	}

	if (!lock_owner_responded) {
		rbd_warn(rbd_dev, "no lock owners detected");
		ret = -ETIMEDOUT;
	}

out:
	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
	return ret;

e_inval:
	ret = -EINVAL;
	goto out;
}

static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
{
	dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);

	cancel_delayed_work(&rbd_dev->lock_dwork);
	if (wake_all)
		wake_up_all(&rbd_dev->lock_waitq);
	else
		wake_up(&rbd_dev->lock_waitq);
}

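/*
 * Fetch information about the current lock holder(s).  Fails with
 * -EBUSY if the lock is held by an external mechanism (foreign tag
 * or cookie prefix) or is shared rather than exclusive.
 */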
static int get_lock_owner_info(struct rbd_device *rbd_dev,
			       struct ceph_locker **lockers, u32 *num_lockers)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	u8 lock_type;
	char *lock_tag;
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
				 &rbd_dev->header_oloc, RBD_LOCK_NAME,
				 &lock_type, &lock_tag, lockers, num_lockers);
	if (ret)
		return ret;

	if (*num_lockers == 0) {
		dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
		goto out;
	}

	if (strcmp(lock_tag, RBD_LOCK_TAG)) {
		rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
			 lock_tag);
		ret = -EBUSY;
		goto out;
	}

	if (lock_type == CEPH_CLS_LOCK_SHARED) {
		rbd_warn(rbd_dev, "shared lock type detected");
		ret = -EBUSY;
		goto out;
	}

	if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
		    strlen(RBD_LOCK_COOKIE_PREFIX))) {
		rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
			 (*lockers)[0].id.cookie);
		ret = -EBUSY;
		goto out;
	}

out:
	kfree(lock_tag);
	return ret;
}

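/*
 * Check whether the locker still has a watch established on the
 * header object.  Returns 1 (and records its client id) if a
 * matching watcher is found, 0 if the locker appears to be dead, or
 * a negative error code.
 */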
static int find_watcher(struct rbd_device *rbd_dev,
			const struct ceph_locker *locker)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_watch_item *watchers;
	u32 num_watchers;
	u64 cookie;
	int i;
	int ret;

	ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
				      &rbd_dev->header_oloc, &watchers,
				      &num_watchers);
	if (ret)
		return ret;

	sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
	for (i = 0; i < num_watchers; i++) {
		if (!memcmp(&watchers[i].addr, &locker->info.addr,
			    sizeof(locker->info.addr)) &&
		    watchers[i].cookie == cookie) {
			struct rbd_client_id cid = {
				.gid = le64_to_cpu(watchers[i].name.num),
				.handle = cookie,
			};

			dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
			     rbd_dev, cid.gid, cid.handle);
			rbd_set_owner_cid(rbd_dev, &cid);
			ret = 1;
			goto out;
		}
	}

	dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
	ret = 0;
out:
	kfree(watchers);
	return ret;
}

/*
 * lock_rwsem must be held for write
 */
static int rbd_try_lock(struct rbd_device *rbd_dev)
{
	struct ceph_client *client = rbd_dev->rbd_client->client;
	struct ceph_locker *lockers;
	u32 num_lockers;
	int ret;

	for (;;) {
		ret = rbd_lock(rbd_dev);
		if (ret != -EBUSY)
			return ret;

		/* determine if the current lock holder is still alive */
		ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
		if (ret)
			return ret;

		if (num_lockers == 0)
			goto again;

		ret = find_watcher(rbd_dev, lockers);
		if (ret) {
			if (ret > 0)
				ret = 0; /* have to request lock */
			goto out;
		}

		rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
			 ENTITY_NAME(lockers[0].id.name));

		ret = ceph_monc_blacklist_add(&client->monc,
					      &lockers[0].info.addr);
		if (ret) {
			rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
				 ENTITY_NAME(lockers[0].id.name), ret);
			goto out;
		}

		ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
					  &rbd_dev->header_oloc, RBD_LOCK_NAME,
					  lockers[0].id.cookie,
					  &lockers[0].id.name);
		if (ret && ret != -ENOENT)
			goto out;

again:
		ceph_free_lockers(lockers, num_lockers);
	}

out:
	ceph_free_lockers(lockers, num_lockers);
	return ret;
}

/*
 * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED
 */
static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev,
						int *pret)
{
	enum rbd_lock_state lock_state;

	down_read(&rbd_dev->lock_rwsem);
	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (__rbd_is_lock_owner(rbd_dev)) {
		lock_state = rbd_dev->lock_state;
		up_read(&rbd_dev->lock_rwsem);
		return lock_state;
	}

	up_read(&rbd_dev->lock_rwsem);
	down_write(&rbd_dev->lock_rwsem);
	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (!__rbd_is_lock_owner(rbd_dev)) {
		*pret = rbd_try_lock(rbd_dev);
		if (*pret)
			rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret);
	}

	lock_state = rbd_dev->lock_state;
	up_write(&rbd_dev->lock_rwsem);
	return lock_state;
}

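/*
 * Delayed work that drives lock acquisition: try to take the lock
 * and, if it is held elsewhere, ask the owner to release it.
 * Reschedules itself until the lock is acquired or the device is
 * blacklisted.
 */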
static void rbd_acquire_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
					    struct rbd_device, lock_dwork);
	enum rbd_lock_state lock_state;
	int ret = 0;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);
again:
	lock_state = rbd_try_acquire_lock(rbd_dev, &ret);
	if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) {
		if (lock_state == RBD_LOCK_STATE_LOCKED)
			wake_requests(rbd_dev, true);
		dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
		     rbd_dev, lock_state, ret);
		return;
	}

	ret = rbd_request_lock(rbd_dev);
	if (ret == -ETIMEDOUT) {
		goto again; /* treat this as a dead client */
	} else if (ret == -EROFS) {
		rbd_warn(rbd_dev, "peer will not release lock");
		/*
		 * If this is rbd_add_acquire_lock(), we want to fail
		 * immediately -- reuse BLACKLISTED flag.  Otherwise we
		 * want to block.
		 */
		if (!(rbd_dev->disk->flags & GENHD_FL_UP)) {
			set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
			/* wake "rbd map --exclusive" process */
			wake_requests(rbd_dev, false);
		}
	} else if (ret < 0) {
		rbd_warn(rbd_dev, "error requesting lock: %d", ret);
		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
				 RBD_RETRY_DELAY);
	} else {
		/*
		 * lock owner acked, but resend if we don't see them
		 * release the lock
		 */
		dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
		     rbd_dev);
		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
		    msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
	}
}

/*
 * lock_rwsem must be held for write
 */
static bool rbd_release_lock(struct rbd_device *rbd_dev)
{
	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
		return false;

	rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
	downgrade_write(&rbd_dev->lock_rwsem);
	/*
	 * Ensure that all in-flight IO is flushed.
	 *
	 * FIXME: ceph_osdc_sync() flushes the entire OSD client, which
	 * may be shared with other devices.
	 */
	ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc);
	up_read(&rbd_dev->lock_rwsem);

	down_write(&rbd_dev->lock_rwsem);
	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
		return false;

	rbd_unlock(rbd_dev);
	/*
	 * Give others a chance to grab the lock - we would re-acquire
	 * almost immediately if we got new IO during ceph_osdc_sync()
	 * otherwise.  We need to ack our own notifications, so this
	 * lock_dwork will be requeued from rbd_wait_state_locked()
	 * after wake_requests() in rbd_handle_released_lock().
	 */
	cancel_delayed_work(&rbd_dev->lock_dwork);
	return true;
}

static void rbd_release_lock_work(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  unlock_work);

	down_write(&rbd_dev->lock_rwsem);
	rbd_release_lock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);
}

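/*
 * Handle an ACQUIRED_LOCK notification: record the sender as the new
 * owner (unless it sent an empty client id) and wake a waiter so it
 * can re-examine the lock state.
 */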
static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
				     void **p)
{
	struct rbd_client_id cid = { 0 };

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
		down_write(&rbd_dev->lock_rwsem);
		if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
			/*
			 * we already know that the remote client is
			 * the owner
			 */
			up_write(&rbd_dev->lock_rwsem);
			return;
		}

		rbd_set_owner_cid(rbd_dev, &cid);
		downgrade_write(&rbd_dev->lock_rwsem);
	} else {
		down_read(&rbd_dev->lock_rwsem);
	}

	if (!__rbd_is_lock_owner(rbd_dev))
		wake_requests(rbd_dev, false);
	up_read(&rbd_dev->lock_rwsem);
}

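/*
 * Handle a RELEASED_LOCK notification: clear the recorded owner if
 * it matches the sender and wake a waiter so that lock acquisition
 * can be retried.
 */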
static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
				     void **p)
{
	struct rbd_client_id cid = { 0 };

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
		down_write(&rbd_dev->lock_rwsem);
		if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
			dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
			     __func__, rbd_dev, cid.gid, cid.handle,
			     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
			up_write(&rbd_dev->lock_rwsem);
			return;
		}

		rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
		downgrade_write(&rbd_dev->lock_rwsem);
	} else {
		down_read(&rbd_dev->lock_rwsem);
	}

	if (!__rbd_is_lock_owner(rbd_dev))
		wake_requests(rbd_dev, false);
	up_read(&rbd_dev->lock_rwsem);
}

/*
 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
 * ResponseMessage is needed.
 */
static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
				   void **p)
{
	struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
	struct rbd_client_id cid = { 0 };
	int result = 1;

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (rbd_cid_equal(&cid, &my_cid))
		return result;

	down_read(&rbd_dev->lock_rwsem);
	if (__rbd_is_lock_owner(rbd_dev)) {
		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
		    rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
			goto out_unlock;

		/*
		 * encode ResponseMessage(0) so the peer can detect
		 * a missing owner
		 */
		result = 0;

		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
			if (!rbd_dev->opts->exclusive) {
				dout("%s rbd_dev %p queueing unlock_work\n",
				     __func__, rbd_dev);
				queue_work(rbd_dev->task_wq,
					   &rbd_dev->unlock_work);
			} else {
				/* refuse to release the lock */
				result = -EROFS;
			}
		}
	}

out_unlock:
	up_read(&rbd_dev->lock_rwsem);
	return result;
}

static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
				     u64 notify_id, u64 cookie, s32 *result)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int buf_size = 4 + CEPH_ENCODING_START_BLK_LEN;
	char buf[buf_size];
	int ret;

	if (result) {
		void *p = buf;

		/* encode ResponseMessage */
		ceph_start_encoding(&p, 1, 1,
				    buf_size - CEPH_ENCODING_START_BLK_LEN);
		ceph_encode_32(&p, *result);
	} else {
		buf_size = 0;
	}

	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
				   &rbd_dev->header_oloc, notify_id, cookie,
				   buf, buf_size);
	if (ret)
		rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
}

static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
				   u64 cookie)
{
	dout("%s rbd_dev %p\n", __func__, rbd_dev);
	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
}

static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
					  u64 notify_id, u64 cookie, s32 result)
{
	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
}

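/*
 * Watch callback for the header object: decode the NotifyMessage (an
 * empty payload is a legacy header update) and dispatch on the
 * notify op, acking the notification in all cases.
 */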
static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
			 u64 notifier_id, void *data, size_t data_len)
{
	struct rbd_device *rbd_dev = arg;
	void *p = data;
	void *const end = p + data_len;
	u8 struct_v = 0;
	u32 len;
	u32 notify_op;
	int ret;

	dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
	     __func__, rbd_dev, cookie, notify_id, data_len);
	if (data_len) {
		ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
					  &struct_v, &len);
		if (ret) {
			rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
				 ret);
			return;
		}

		notify_op = ceph_decode_32(&p);
	} else {
		/* legacy notification for header updates */
		notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
		len = 0;
	}

	dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
	switch (notify_op) {
	case RBD_NOTIFY_OP_ACQUIRED_LOCK:
		rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_RELEASED_LOCK:
		rbd_handle_released_lock(rbd_dev, struct_v, &p);
		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_REQUEST_LOCK:
		ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
		if (ret <= 0)
			rbd_acknowledge_notify_result(rbd_dev, notify_id,
						      cookie, ret);
		else
			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_HEADER_UPDATE:
		ret = rbd_dev_refresh(rbd_dev);
		if (ret)
			rbd_warn(rbd_dev, "refresh failed: %d", ret);

		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	default:
		if (rbd_is_lock_owner(rbd_dev))
			rbd_acknowledge_notify_result(rbd_dev, notify_id,
						      cookie, -EOPNOTSUPP);
		else
			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	}
}

static void __rbd_unregister_watch(struct rbd_device *rbd_dev);

static void rbd_watch_errcb(void *arg, u64 cookie, int err)
{
	struct rbd_device *rbd_dev = arg;

	rbd_warn(rbd_dev, "encountered watch error: %d", err);

	down_write(&rbd_dev->lock_rwsem);
	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
	up_write(&rbd_dev->lock_rwsem);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
		__rbd_unregister_watch(rbd_dev);
		rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;

		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
	}
	mutex_unlock(&rbd_dev->watch_mutex);
}

/*
 * watch_mutex must be locked
 */
static int __rbd_register_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_linger_request *handle;

	rbd_assert(!rbd_dev->watch_handle);
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
				 &rbd_dev->header_oloc, rbd_watch_cb,
				 rbd_watch_errcb, rbd_dev);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	rbd_dev->watch_handle = handle;
	return 0;
}

/*
 * watch_mutex must be locked
 */
static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	rbd_assert(rbd_dev->watch_handle);
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
	if (ret)
		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);

	rbd_dev->watch_handle = NULL;
}

static int rbd_register_watch(struct rbd_device *rbd_dev)
{
	int ret;

	mutex_lock(&rbd_dev->watch_mutex);
	rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
	ret = __rbd_register_watch(rbd_dev);
	if (ret)
		goto out;

	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;

out:
	mutex_unlock(&rbd_dev->watch_mutex);
	return ret;
}

static void cancel_tasks_sync(struct rbd_device *rbd_dev)
{
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	cancel_delayed_work_sync(&rbd_dev->watch_dwork);
	cancel_work_sync(&rbd_dev->acquired_lock_work);
	cancel_work_sync(&rbd_dev->released_lock_work);
	cancel_delayed_work_sync(&rbd_dev->lock_dwork);
	cancel_work_sync(&rbd_dev->unlock_work);
}

static void rbd_unregister_watch(struct rbd_device *rbd_dev)
{
	WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
	cancel_tasks_sync(rbd_dev);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
		__rbd_unregister_watch(rbd_dev);
	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
	mutex_unlock(&rbd_dev->watch_mutex);

	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char cookie[32];
	int ret;

	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);

	format_lock_cookie(rbd_dev, cookie);
	ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, RBD_LOCK_NAME,
				  CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
				  RBD_LOCK_TAG, cookie);
	if (ret) {
		if (ret != -EOPNOTSUPP)
			rbd_warn(rbd_dev, "failed to update lock cookie: %d",
				 ret);

		/*
		 * Lock cookie cannot be updated on older OSDs, so do
		 * a manual release and queue an acquire.
		 */
		if (rbd_release_lock(rbd_dev))
			queue_delayed_work(rbd_dev->task_wq,
					   &rbd_dev->lock_dwork, 0);
	} else {
		__rbd_lock(rbd_dev, cookie);
	}
}

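/*
 * Delayed work to recover from a watch error: re-register the watch,
 * re-acquire the exclusive lock if we held it and refresh the
 * header.  On -EBLACKLISTED or -ENOENT the device is marked
 * blacklisted and waiters are woken instead.
 */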
static void rbd_reregister_watch(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
					    struct rbd_device, watch_dwork);
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
		mutex_unlock(&rbd_dev->watch_mutex);
		return;
	}

	ret = __rbd_register_watch(rbd_dev);
	if (ret) {
		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
		if (ret == -EBLACKLISTED || ret == -ENOENT) {
			set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
			wake_requests(rbd_dev, true);
		} else {
			queue_delayed_work(rbd_dev->task_wq,
					   &rbd_dev->watch_dwork,
					   RBD_RETRY_DELAY);
		}
		mutex_unlock(&rbd_dev->watch_mutex);
		return;
	}

	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
	mutex_unlock(&rbd_dev->watch_mutex);

	down_write(&rbd_dev->lock_rwsem);
	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
		rbd_reacquire_lock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);

	ret = rbd_dev_refresh(rbd_dev);
	if (ret)
		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
}

/*
 * Synchronous osd object method call.  Returns the number of bytes
 * returned in the inbound buffer, or a negative error code.
 */
static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
			       struct ceph_object_id *oid,
			       struct ceph_object_locator *oloc,
			       const char *method_name,
			       const void *outbound,
			       size_t outbound_size,
			       void *inbound,
			       size_t inbound_size)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct page *req_page = NULL;
	struct page *reply_page;
	int ret;

	/*
	 * Method calls are ultimately read operations.  The result
	 * should be placed into the inbound buffer provided.  They
	 * also supply outbound data--parameters for the object
	 * method.  Currently if this is present it will be a
	 * snapshot id.
	 */
	if (outbound) {
		if (outbound_size > PAGE_SIZE)
			return -E2BIG;

		req_page = alloc_page(GFP_KERNEL);
		if (!req_page)
			return -ENOMEM;

		memcpy(page_address(req_page), outbound, outbound_size);
	}

	reply_page = alloc_page(GFP_KERNEL);
	if (!reply_page) {
		if (req_page)
			__free_page(req_page);
		return -ENOMEM;
	}

	ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
			     CEPH_OSD_FLAG_READ, req_page, outbound_size,
			     reply_page, &inbound_size);
	if (!ret) {
		memcpy(inbound, page_address(reply_page), inbound_size);
		ret = inbound_size;
	}

	if (req_page)
		__free_page(req_page);
	__free_page(reply_page);
	return ret;
}

/*
 * lock_rwsem must be held for read
 */
static void rbd_wait_state_locked(struct rbd_device *rbd_dev)
{
	DEFINE_WAIT(wait);

	do {
		/*
		 * Note the use of mod_delayed_work() in rbd_acquire_lock()
		 * and cancel_delayed_work() in wake_requests().
		 */
		dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
		prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
					  TASK_UNINTERRUPTIBLE);
		up_read(&rbd_dev->lock_rwsem);
		schedule();
		down_read(&rbd_dev->lock_rwsem);
	} while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED &&
		 !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags));

	finish_wait(&rbd_dev->lock_waitq, &wait);
}

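/*
 * Per-request work function: translate a block layer request into an
 * image request, taking (or waiting for) the exclusive lock first if
 * the image requires it.
 */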
static void rbd_queue_workfn(struct work_struct *work)
{
	struct request *rq = blk_mq_rq_from_pdu(work);
	struct rbd_device *rbd_dev = rq->q->queuedata;
	struct rbd_img_request *img_request;
	struct ceph_snap_context *snapc = NULL;
	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
	u64 length = blk_rq_bytes(rq);
	enum obj_operation_type op_type;
	u64 mapping_size;
	bool must_be_locked;
	int result;

	switch (req_op(rq)) {
	case REQ_OP_DISCARD:
	case REQ_OP_WRITE_ZEROES:
		op_type = OBJ_OP_DISCARD;
		break;
	case REQ_OP_WRITE:
		op_type = OBJ_OP_WRITE;
		break;
	case REQ_OP_READ:
		op_type = OBJ_OP_READ;
		break;
	default:
		dout("%s: non-fs request type %d\n", __func__, req_op(rq));
		result = -EIO;
		goto err;
	}

	/* Ignore/skip any zero-length requests */

	if (!length) {
		dout("%s: zero-length request\n", __func__);
		result = 0;
		goto err_rq;
	}

	rbd_assert(op_type == OBJ_OP_READ ||
		   rbd_dev->spec->snap_id == CEPH_NOSNAP);

	/*
	 * Quit early if the mapped snapshot no longer exists.  It's
	 * still possible the snapshot will have disappeared by the
	 * time our request arrives at the osd, but there's no sense in
	 * sending it if we already know.
	 */
	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
		dout("request for non-existent snapshot");
		rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
		result = -ENXIO;
		goto err_rq;
	}

	if (offset && length > U64_MAX - offset + 1) {
		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
			 length);
		result = -EINVAL;
		goto err_rq;	/* Shouldn't happen */
	}

	blk_mq_start_request(rq);

	down_read(&rbd_dev->header_rwsem);
	mapping_size = rbd_dev->mapping.size;
	if (op_type != OBJ_OP_READ) {
		snapc = rbd_dev->header.snapc;
		ceph_get_snap_context(snapc);
	}
	up_read(&rbd_dev->header_rwsem);

	if (offset + length > mapping_size) {
		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
			 length, mapping_size);
		result = -EIO;
		goto err_rq;
	}

	must_be_locked =
	    (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
	    (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read);
	if (must_be_locked) {
		down_read(&rbd_dev->lock_rwsem);
		if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED &&
		    !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
			if (rbd_dev->opts->exclusive) {
				rbd_warn(rbd_dev, "exclusive lock required");
				result = -EROFS;
				goto err_unlock;
			}
			rbd_wait_state_locked(rbd_dev);
		}
		if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
			result = -EBLACKLISTED;
			goto err_unlock;
		}
	}

	img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
	if (!img_request) {
		result = -ENOMEM;
		goto err_unlock;
	}
	img_request->rq = rq;
	snapc = NULL; /* img_request consumes a ref */

	if (op_type == OBJ_OP_DISCARD)
		result = rbd_img_fill_nodata(img_request, offset, length);
	else
		result = rbd_img_fill_from_bio(img_request, offset, length,
					       rq->bio);
	if (result)
		goto err_img_request;

	rbd_img_request_submit(img_request);
	if (must_be_locked)
		up_read(&rbd_dev->lock_rwsem);
	return;

err_img_request:
	rbd_img_request_put(img_request);
err_unlock:
	if (must_be_locked)
		up_read(&rbd_dev->lock_rwsem);
err_rq:
	if (result)
		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
			 obj_op_name(op_type), length, offset, result);
	ceph_put_snap_context(snapc);
err:
	blk_mq_end_request(rq, errno_to_blk_status(result));
}

static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
		const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	struct work_struct *work = blk_mq_rq_to_pdu(rq);

	queue_work(rbd_wq, work);
	return BLK_STS_OK;
}

static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	blk_cleanup_queue(rbd_dev->disk->queue);
	blk_mq_free_tag_set(&rbd_dev->tag_set);
	put_disk(rbd_dev->disk);
	rbd_dev->disk = NULL;
}

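/*
 * Synchronously read up to buf_len bytes from the start of an object
 * into buf.  Returns the number of bytes read on success or a
 * negative error code.
 */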
static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
			     struct ceph_object_id *oid,
			     struct ceph_object_locator *oloc,
			     void *buf, int buf_len)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_request *req;
	struct page **pages;
	int num_pages = calc_pages_for(0, buf_len);
	int ret;

	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
	if (!req)
		return -ENOMEM;

	ceph_oid_copy(&req->r_base_oid, oid);
	ceph_oloc_copy(&req->r_base_oloc, oloc);
	req->r_flags = CEPH_OSD_FLAG_READ;

	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
	if (ret)
		goto out_req;

	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		goto out_req;
	}

	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
	osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
					 true);

	ceph_osdc_start_request(osdc, req, false);
	ret = ceph_osdc_wait_request(osdc, req);
	if (ret >= 0)
		ceph_copy_from_page_vector(pages, buf, 0, ret);

out_req:
	ceph_osdc_put_request(req);
	return ret;
}

/*
 * Read the complete header for the given rbd device.  On successful
 * return, the rbd_dev->header field will contain up-to-date
 * information about the image.
 */
static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		kfree(ondisk);

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return -ENOMEM;

		ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
					&rbd_dev->header_oloc, ondisk, size);
		if (ret < 0)
			goto out;
		if ((size_t)ret < size) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
				 size, ret);
			goto out;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "invalid header");
			goto out;
		}

		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	ret = rbd_header_from_disk(rbd_dev, ondisk);
out:
	kfree(ondisk);

	return ret;
}

/*
 * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
 * has disappeared from the (just updated) snapshot context.
 */
static void rbd_exists_validate(struct rbd_device *rbd_dev)
{
	u64 snap_id;

	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
		return;

	snap_id = rbd_dev->spec->snap_id;
	if (snap_id == CEPH_NOSNAP)
		return;

	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
}

static void rbd_dev_update_size(struct rbd_device *rbd_dev)
{
	sector_t size;

	/*
	 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
	 * try to update its size.  If REMOVING is set, updating size
	 * is just useless work since the device can't be opened.
	 */
	if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
	    !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
		dout("setting size to %llu sectors", (unsigned long long)size);
		set_capacity(rbd_dev->disk, size);
		revalidate_disk(rbd_dev->disk);
	}
}

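/*
 * Re-read the image header and propagate any changes: detect a
 * flattened parent via rbd_dev_v2_parent_info(), pick up the new
 * image size for the base image and revalidate the EXISTS flag for a
 * mapped snapshot.
 */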
static int rbd_dev_refresh(struct rbd_device *rbd_dev)
{
	u64 mapping_size;
	int ret;

	down_write(&rbd_dev->header_rwsem);
	mapping_size = rbd_dev->mapping.size;

	ret = rbd_dev_header_info(rbd_dev);
	if (ret)
		goto out;

	/*
	 * If there is a parent, see if it has disappeared due to the
	 * mapped image getting flattened.
	 */
	if (rbd_dev->parent) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret)
			goto out;
	}

	if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
		rbd_dev->mapping.size = rbd_dev->header.image_size;
	} else {
		/* validate mapped snapshot's EXISTS flag */
		rbd_exists_validate(rbd_dev);
	}

out:
	up_write(&rbd_dev->header_rwsem);
	if (!ret && mapping_size != rbd_dev->mapping.size)
		rbd_dev_update_size(rbd_dev);

	return ret;
}

static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
		unsigned int hctx_idx, unsigned int numa_node)
{
	struct work_struct *work = blk_mq_rq_to_pdu(rq);

	INIT_WORK(work, rbd_queue_workfn);
	return 0;
}

static const struct blk_mq_ops rbd_mq_ops = {
	.queue_rq	= rbd_queue_rq,
	.init_request	= rbd_init_request,
};

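/*
 * Set up the gendisk and blk-mq queue for the mapped image: a single
 * hw queue with a per-request work struct as the request pdu, and
 * queue limits derived from the RBD object size.
 */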
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;
	int err;

	/* create gendisk info */
	disk = alloc_disk(single_major ?
			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
			  RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = rbd_dev->minor;
	if (single_major)
		disk->flags |= GENHD_FL_EXT_DEVT;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
	rbd_dev->tag_set.ops = &rbd_mq_ops;
	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
	rbd_dev->tag_set.nr_hw_queues = 1;
	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);

	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
	if (err)
		goto out_disk;

	q = blk_mq_init_queue(&rbd_dev->tag_set);
	if (IS_ERR(q)) {
		err = PTR_ERR(q);
		goto out_tag_set;
	}

	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	q->limits.max_sectors = queue_max_hw_sectors(q);
	blk_queue_max_segments(q, USHRT_MAX);
	blk_queue_max_segment_size(q, UINT_MAX);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* enable the discard support */
	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
	q->limits.discard_granularity = segment_size;
	blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_write_zeroes_sectors(q, segment_size / SECTOR_SIZE);

	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
		q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;

	/*
	 * disk_release() expects a queue ref from add_disk() and will
	 * put it.  Hold an extra ref until add_disk() is called.
	 */
	WARN_ON(!blk_get_queue(q));
	disk->queue = q;
	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	return 0;
out_tag_set:
	blk_mq_free_tag_set(&rbd_dev->tag_set);
out_disk:
	put_disk(disk);
	return err;
}

/*
 sysfs
*/

static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}

static ssize_t rbd_size_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n",
		       (unsigned long long)rbd_dev->mapping.size);
}

/*
 * Note this shows the features for whatever's mapped, which is not
 * necessarily the base image.
 */
static ssize_t rbd_features_show(struct device *dev,
				 struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "0x%016llx\n",
		       (unsigned long long)rbd_dev->mapping.features);
}

static ssize_t rbd_major_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->major)
		return sprintf(buf, "%d\n", rbd_dev->major);

	return sprintf(buf, "(none)\n");
}

static ssize_t rbd_minor_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%d\n", rbd_dev->minor);
}

static ssize_t rbd_client_addr_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	struct ceph_entity_addr *client_addr =
	    ceph_client_addr(rbd_dev->rbd_client->client);

	return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
		       le32_to_cpu(client_addr->nonce));
}

static ssize_t rbd_client_id_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "client%lld\n",
		       ceph_client_gid(rbd_dev->rbd_client->client));
}

static ssize_t rbd_cluster_fsid_show(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
}

static ssize_t rbd_config_info_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->config_info);
}

static ssize_t rbd_pool_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
}

static ssize_t rbd_pool_id_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n",
		       (unsigned long long) rbd_dev->spec->pool_id);
}

static ssize_t rbd_name_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->spec->image_name)
		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);

	return sprintf(buf, "(unknown)\n");
}

static ssize_t rbd_image_id_show(struct device *dev,
				 struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
}

/*
 * Shows the name of the currently-mapped snapshot (or
 * RBD_SNAP_HEAD_NAME for the base image).
 */
static ssize_t rbd_snap_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
}

static ssize_t rbd_snap_id_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
}

/*
 * For a v2 image, shows the chain of parent images, separated by empty
 * lines.  For v1 images or if there is no parent, shows "(no parent
 * image)".
 */
static ssize_t rbd_parent_show(struct device *dev,
			       struct device_attribute *attr,
			       char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	ssize_t count = 0;

	if (!rbd_dev->parent)
		return sprintf(buf, "(no parent image)\n");

	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
		struct rbd_spec *spec = rbd_dev->parent_spec;

		count += sprintf(&buf[count], "%s"
			    "pool_id %llu\npool_name %s\n"
			    "image_id %s\nimage_name %s\n"
			    "snap_id %llu\nsnap_name %s\n"
			    "overlap %llu\n",
			    !count ? "" : "\n", /* first? */
			    spec->pool_id, spec->pool_name,
			    spec->image_id, spec->image_name ?: "(unknown)",
			    spec->snap_id, spec->snap_name,
			    rbd_dev->parent_overlap);
	}

	return count;
}

static ssize_t rbd_image_refresh(struct device *dev,
				 struct device_attribute *attr,
				 const char *buf,
				 size_t size)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05004172 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004173
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004174 ret = rbd_dev_refresh(rbd_dev);
Alex Eldere627db02013-05-06 07:40:30 -05004175 if (ret)
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04004176 return ret;
Alex Elderb8136232012-07-25 09:32:41 -05004177
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04004178 return size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004179}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004180
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004181static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
Alex Elder34b13182012-07-13 20:35:12 -05004182static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004183static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004184static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02004185static DEVICE_ATTR(client_addr, S_IRUGO, rbd_client_addr_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004186static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
Mike Christie267fb902016-08-18 18:38:43 +02004187static DEVICE_ATTR(cluster_fsid, S_IRUGO, rbd_cluster_fsid_show, NULL);
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004188static DEVICE_ATTR(config_info, S_IRUSR, rbd_config_info_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004189static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
Alex Elder9bb2f332012-07-12 10:46:35 -05004190static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004191static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
Alex Elder589d30e2012-07-10 20:30:11 -05004192static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004193static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
4194static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
Mike Christie92a58672016-08-18 18:38:44 +02004195static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
Alex Elder86b00e02012-10-25 23:34:42 -05004196static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004197
4198static struct attribute *rbd_attrs[] = {
4199 &dev_attr_size.attr,
Alex Elder34b13182012-07-13 20:35:12 -05004200 &dev_attr_features.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004201 &dev_attr_major.attr,
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004202 &dev_attr_minor.attr,
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02004203 &dev_attr_client_addr.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004204 &dev_attr_client_id.attr,
Mike Christie267fb902016-08-18 18:38:43 +02004205 &dev_attr_cluster_fsid.attr,
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004206 &dev_attr_config_info.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004207 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05004208 &dev_attr_pool_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004209 &dev_attr_name.attr,
Alex Elder589d30e2012-07-10 20:30:11 -05004210 &dev_attr_image_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004211 &dev_attr_current_snap.attr,
Mike Christie92a58672016-08-18 18:38:44 +02004212 &dev_attr_snap_id.attr,
Alex Elder86b00e02012-10-25 23:34:42 -05004213 &dev_attr_parent.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004214 &dev_attr_refresh.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004215 NULL
4216};
4217
4218static struct attribute_group rbd_attr_group = {
4219 .attrs = rbd_attrs,
4220};
4221
4222static const struct attribute_group *rbd_attr_groups[] = {
4223 &rbd_attr_group,
4224 NULL
4225};
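/*
 * Hedged sketch of how the attributes above surface to userspace
 * (device id and values invented for illustration): once a device is
 * mapped, each attribute is a read-only file such as
 *
 *	/sys/bus/rbd/devices/0/pool		-> "rbd"
 *	/sys/bus/rbd/devices/0/current_snap	-> "-"
 *	/sys/bus/rbd/devices/0/size		-> "10737418240"
 *
 * and each read lands in the corresponding *_show() callback.
 */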
4226
Ilya Dryomov6cac4692015-10-16 20:11:25 +02004227static void rbd_dev_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004228
Bhumika Goyalb9942bc2017-02-11 12:14:38 +05304229static const struct device_type rbd_device_type = {
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004230 .name = "rbd",
4231 .groups = rbd_attr_groups,
Ilya Dryomov6cac4692015-10-16 20:11:25 +02004232 .release = rbd_dev_release,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004233};
4234
Alex Elder8b8fb992012-10-26 17:25:24 -05004235static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
4236{
4237 kref_get(&spec->kref);
4238
4239 return spec;
4240}
4241
4242static void rbd_spec_free(struct kref *kref);
4243static void rbd_spec_put(struct rbd_spec *spec)
4244{
4245 if (spec)
4246 kref_put(&spec->kref, rbd_spec_free);
4247}
4248
4249static struct rbd_spec *rbd_spec_alloc(void)
4250{
4251 struct rbd_spec *spec;
4252
4253 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
4254 if (!spec)
4255 return NULL;
Ilya Dryomov04077592014-07-23 17:11:20 +04004256
4257 spec->pool_id = CEPH_NOPOOL;
4258 spec->snap_id = CEPH_NOSNAP;
Alex Elder8b8fb992012-10-26 17:25:24 -05004259 kref_init(&spec->kref);
4260
Alex Elder8b8fb992012-10-26 17:25:24 -05004261 return spec;
4262}
4263
4264static void rbd_spec_free(struct kref *kref)
4265{
4266 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
4267
4268 kfree(spec->pool_name);
4269 kfree(spec->image_id);
4270 kfree(spec->image_name);
4271 kfree(spec->snap_name);
4272 kfree(spec);
4273}
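/*
 * Illustrative (not from the original source) use of the refcounting
 * helpers above; a spec is shared rather than copied between related
 * images:
 *
 *	struct rbd_spec *spec = rbd_spec_alloc();
 *	if (!spec)
 *		return -ENOMEM;
 *	...
 *	other_dev->spec = rbd_spec_get(spec);	-- takes a second ref
 *	rbd_spec_put(spec);			-- drops our ref
 */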
4274
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004275static void rbd_dev_free(struct rbd_device *rbd_dev)
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004276{
Ilya Dryomov99d16942016-08-12 16:11:41 +02004277 WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004278 WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004279
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004280 ceph_oid_destroy(&rbd_dev->header_oid);
Ilya Dryomov6b6dddb2016-08-05 16:15:38 +02004281 ceph_oloc_destroy(&rbd_dev->header_oloc);
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004282 kfree(rbd_dev->config_info);
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004283
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004284 rbd_put_client(rbd_dev->rbd_client);
4285 rbd_spec_put(rbd_dev->spec);
4286 kfree(rbd_dev->opts);
4287 kfree(rbd_dev);
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004288}
4289
4290static void rbd_dev_release(struct device *dev)
4291{
4292 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4293 bool need_put = !!rbd_dev->opts;
4294
4295 if (need_put) {
4296 destroy_workqueue(rbd_dev->task_wq);
4297 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4298 }
4299
4300 rbd_dev_free(rbd_dev);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004301
4302 /*
4303	 * This is racy, but way better than putting the module ref outside of
4304 * the release callback. The race window is pretty small, so
4305 * doing something similar to dm (dm-builtin.c) is overkill.
4306 */
4307 if (need_put)
4308 module_put(THIS_MODULE);
4309}
4310
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004311static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
4312 struct rbd_spec *spec)
Alex Elderc53d5892012-10-25 23:34:42 -05004313{
4314 struct rbd_device *rbd_dev;
4315
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004316 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
Alex Elderc53d5892012-10-25 23:34:42 -05004317 if (!rbd_dev)
4318 return NULL;
4319
4320 spin_lock_init(&rbd_dev->lock);
4321 INIT_LIST_HEAD(&rbd_dev->node);
Alex Elderc53d5892012-10-25 23:34:42 -05004322 init_rwsem(&rbd_dev->header_rwsem);
4323
Ilya Dryomov7e973322017-01-25 18:16:22 +01004324 rbd_dev->header.data_pool_id = CEPH_NOPOOL;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004325 ceph_oid_init(&rbd_dev->header_oid);
Ilya Dryomov431a02c2017-01-25 18:16:21 +01004326 rbd_dev->header_oloc.pool = spec->pool_id;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004327
Ilya Dryomov99d16942016-08-12 16:11:41 +02004328 mutex_init(&rbd_dev->watch_mutex);
4329 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
4330 INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
4331
Ilya Dryomoved95b212016-08-12 16:40:02 +02004332 init_rwsem(&rbd_dev->lock_rwsem);
4333 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
4334 INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
4335 INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
4336 INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
4337 INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
4338 init_waitqueue_head(&rbd_dev->lock_waitq);
4339
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004340 rbd_dev->dev.bus = &rbd_bus_type;
4341 rbd_dev->dev.type = &rbd_device_type;
4342 rbd_dev->dev.parent = &rbd_root_dev;
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004343 device_initialize(&rbd_dev->dev);
4344
Alex Elderc53d5892012-10-25 23:34:42 -05004345 rbd_dev->rbd_client = rbdc;
Ilya Dryomovd1475432015-06-22 13:24:48 +03004346 rbd_dev->spec = spec;
Alex Elder0903e872012-11-14 12:25:19 -06004347
Alex Elderc53d5892012-10-25 23:34:42 -05004348 return rbd_dev;
4349}
4350
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004351/*
4352 * Create a mapping rbd_dev (as opposed to a parent rbd_dev).
4353 */
4354static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
4355 struct rbd_spec *spec,
4356 struct rbd_options *opts)
4357{
4358 struct rbd_device *rbd_dev;
4359
4360 rbd_dev = __rbd_dev_create(rbdc, spec);
4361 if (!rbd_dev)
4362 return NULL;
4363
4364 rbd_dev->opts = opts;
4365
4366 /* get an id and fill in device name */
4367 rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
4368 minor_to_rbd_dev_id(1 << MINORBITS),
4369 GFP_KERNEL);
4370 if (rbd_dev->dev_id < 0)
4371 goto fail_rbd_dev;
4372
4373 sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
4374 rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
4375 rbd_dev->name);
4376 if (!rbd_dev->task_wq)
4377 goto fail_dev_id;
4378
4379 /* we have a ref from do_rbd_add() */
4380 __module_get(THIS_MODULE);
4381
4382 dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
4383 return rbd_dev;
4384
4385fail_dev_id:
4386 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4387fail_rbd_dev:
4388 rbd_dev_free(rbd_dev);
4389 return NULL;
4390}
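/*
 * Expected pairing, inferred from the reference comment above rather
 * than shown in this excerpt: do_rbd_add() takes the module ref,
 * calls rbd_dev_create(), and unwinds with rbd_dev_destroy(); the
 * final put_device() runs rbd_dev_release(), which frees the
 * workqueue and dev id and drops the module ref.
 */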
4391
Alex Elderc53d5892012-10-25 23:34:42 -05004392static void rbd_dev_destroy(struct rbd_device *rbd_dev)
4393{
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004394 if (rbd_dev)
4395 put_device(&rbd_dev->dev);
Alex Elderc53d5892012-10-25 23:34:42 -05004396}
4397
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004398/*
Alex Elder9d475de2012-07-03 16:01:19 -05004399 * Get the size and object order for an image snapshot, or if
4400 * snap_id is CEPH_NOSNAP, get this information for the base
4401 * image.
4402 */
4403static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
4404 u8 *order, u64 *snap_size)
4405{
4406 __le64 snapid = cpu_to_le64(snap_id);
4407 int ret;
4408 struct {
4409 u8 order;
4410 __le64 size;
4411 } __attribute__ ((packed)) size_buf = { 0 };
4412
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004413 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4414 &rbd_dev->header_oloc, "get_size",
4415 &snapid, sizeof(snapid),
4416 &size_buf, sizeof(size_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06004417 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder9d475de2012-07-03 16:01:19 -05004418 if (ret < 0)
4419 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05004420 if (ret < sizeof (size_buf))
4421 return -ERANGE;
Alex Elder9d475de2012-07-03 16:01:19 -05004422
Josh Durginc3545572013-08-28 17:08:10 -07004423 if (order) {
Alex Elderc86f86e2013-04-25 15:09:41 -05004424 *order = size_buf.order;
Josh Durginc3545572013-08-28 17:08:10 -07004425 dout(" order %u", (unsigned int)*order);
4426 }
Alex Elder9d475de2012-07-03 16:01:19 -05004427 *snap_size = le64_to_cpu(size_buf.size);
4428
Josh Durginc3545572013-08-28 17:08:10 -07004429 dout(" snap_id 0x%016llx snap_size = %llu\n",
4430 (unsigned long long)snap_id,
Alex Elder57385b52013-04-21 12:14:45 -05004431 (unsigned long long)*snap_size);
Alex Elder9d475de2012-07-03 16:01:19 -05004432
4433 return 0;
4434}
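/*
 * Worked example for the reply decoded above (values illustrative):
 * "get_size" returns a packed { u8 order; __le64 size; }.  An order
 * of 22 means 1ULL << 22 = 4 MiB backing objects; size is the
 * snapshot's logical size in bytes.
 */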
4435
4436static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
4437{
4438 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
4439 &rbd_dev->header.obj_order,
4440 &rbd_dev->header.image_size);
4441}
4442
Alex Elder1e130192012-07-03 16:01:19 -05004443static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
4444{
4445 void *reply_buf;
4446 int ret;
4447 void *p;
4448
4449 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
4450 if (!reply_buf)
4451 return -ENOMEM;
4452
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004453 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4454 &rbd_dev->header_oloc, "get_object_prefix",
4455 NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
Alex Elder36be9a72013-01-19 00:30:28 -06004456 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder1e130192012-07-03 16:01:19 -05004457 if (ret < 0)
4458 goto out;
4459
4460 p = reply_buf;
4461 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
Alex Elder57385b52013-04-21 12:14:45 -05004462 p + ret, NULL, GFP_NOIO);
4463 ret = 0;
Alex Elder1e130192012-07-03 16:01:19 -05004464
4465 if (IS_ERR(rbd_dev->header.object_prefix)) {
4466 ret = PTR_ERR(rbd_dev->header.object_prefix);
4467 rbd_dev->header.object_prefix = NULL;
4468 } else {
4469 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
4470 }
Alex Elder1e130192012-07-03 16:01:19 -05004471out:
4472 kfree(reply_buf);
4473
4474 return ret;
4475}
4476
Alex Elderb1b54022012-07-03 16:01:19 -05004477static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
4478 u64 *snap_features)
4479{
4480 __le64 snapid = cpu_to_le64(snap_id);
4481 struct {
4482 __le64 features;
4483 __le64 incompat;
Alex Elder41579762013-04-21 12:14:45 -05004484 } __attribute__ ((packed)) features_buf = { 0 };
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004485 u64 unsup;
Alex Elderb1b54022012-07-03 16:01:19 -05004486 int ret;
4487
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004488 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4489 &rbd_dev->header_oloc, "get_features",
4490 &snapid, sizeof(snapid),
4491 &features_buf, sizeof(features_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06004492 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderb1b54022012-07-03 16:01:19 -05004493 if (ret < 0)
4494 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05004495 if (ret < sizeof (features_buf))
4496 return -ERANGE;
Alex Elderd8891402012-10-09 13:50:17 -07004497
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004498 unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
4499 if (unsup) {
4500 rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
4501 unsup);
Alex Elderb8f5c6e2012-11-01 08:39:26 -05004502 return -ENXIO;
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004503 }
Alex Elderd8891402012-10-09 13:50:17 -07004504
Alex Elderb1b54022012-07-03 16:01:19 -05004505 *snap_features = le64_to_cpu(features_buf.features);
4506
4507 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
Alex Elder57385b52013-04-21 12:14:45 -05004508 (unsigned long long)snap_id,
4509 (unsigned long long)*snap_features,
4510 (unsigned long long)le64_to_cpu(features_buf.incompat));
Alex Elderb1b54022012-07-03 16:01:19 -05004511
4512 return 0;
4513}
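/*
 * The split decoded above: "features" is every feature the image has
 * enabled, while "incompat" is the subset a client must understand to
 * map the image at all, which is why only incompat bits outside
 * RBD_FEATURES_SUPPORTED make the probe fail with -ENXIO.
 */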
4514
4515static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
4516{
4517 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
4518 &rbd_dev->header.features);
4519}
4520
Alex Elder86b00e02012-10-25 23:34:42 -05004521static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
4522{
4523 struct rbd_spec *parent_spec;
4524 size_t size;
4525 void *reply_buf = NULL;
4526 __le64 snapid;
4527 void *p;
4528 void *end;
Alex Elder642a2532013-05-06 17:40:33 -05004529 u64 pool_id;
Alex Elder86b00e02012-10-25 23:34:42 -05004530 char *image_id;
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004531 u64 snap_id;
Alex Elder86b00e02012-10-25 23:34:42 -05004532 u64 overlap;
Alex Elder86b00e02012-10-25 23:34:42 -05004533 int ret;
4534
4535 parent_spec = rbd_spec_alloc();
4536 if (!parent_spec)
4537 return -ENOMEM;
4538
4539 size = sizeof (__le64) + /* pool_id */
4540 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
4541 sizeof (__le64) + /* snap_id */
4542 sizeof (__le64); /* overlap */
4543 reply_buf = kmalloc(size, GFP_KERNEL);
4544 if (!reply_buf) {
4545 ret = -ENOMEM;
4546 goto out_err;
4547 }
4548
Ilya Dryomov4d9b67c2014-07-24 10:42:13 +04004549 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004550 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4551 &rbd_dev->header_oloc, "get_parent",
4552 &snapid, sizeof(snapid), reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06004553 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder86b00e02012-10-25 23:34:42 -05004554 if (ret < 0)
4555 goto out_err;
4556
Alex Elder86b00e02012-10-25 23:34:42 -05004557 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05004558 end = reply_buf + ret;
4559 ret = -ERANGE;
Alex Elder642a2532013-05-06 17:40:33 -05004560 ceph_decode_64_safe(&p, end, pool_id, out_err);
Alex Elder392a9da2013-05-06 17:40:33 -05004561 if (pool_id == CEPH_NOPOOL) {
4562 /*
4563 * Either the parent never existed, or we have a
4564 * record of it but the image got flattened so it no
4565 * longer has a parent. When the parent of a
4566 * layered image disappears we immediately set the
4567 * overlap to 0. The effect of this is that all new
4568 * requests will be treated as if the image had no
4569 * parent.
4570 */
4571 if (rbd_dev->parent_overlap) {
4572 rbd_dev->parent_overlap = 0;
Alex Elder392a9da2013-05-06 17:40:33 -05004573 rbd_dev_parent_put(rbd_dev);
4574 pr_info("%s: clone image has been flattened\n",
4575 rbd_dev->disk->disk_name);
4576 }
4577
Alex Elder86b00e02012-10-25 23:34:42 -05004578 goto out; /* No parent? No problem. */
Alex Elder392a9da2013-05-06 17:40:33 -05004579 }
Alex Elder86b00e02012-10-25 23:34:42 -05004580
Alex Elder0903e872012-11-14 12:25:19 -06004581 /* The ceph file layout needs to fit pool id in 32 bits */
4582
4583 ret = -EIO;
Alex Elder642a2532013-05-06 17:40:33 -05004584 if (pool_id > (u64)U32_MAX) {
Ilya Dryomov9584d502014-07-11 12:11:20 +04004585 rbd_warn(NULL, "parent pool id too large (%llu > %u)",
Alex Elder642a2532013-05-06 17:40:33 -05004586 (unsigned long long)pool_id, U32_MAX);
Alex Elder57385b52013-04-21 12:14:45 -05004587 goto out_err;
Alex Elderc0cd10db2013-04-26 09:43:47 -05004588 }
Alex Elder0903e872012-11-14 12:25:19 -06004589
Alex Elder979ed482012-11-01 08:39:26 -05004590 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elder86b00e02012-10-25 23:34:42 -05004591 if (IS_ERR(image_id)) {
4592 ret = PTR_ERR(image_id);
4593 goto out_err;
4594 }
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004595 ceph_decode_64_safe(&p, end, snap_id, out_err);
Alex Elder86b00e02012-10-25 23:34:42 -05004596 ceph_decode_64_safe(&p, end, overlap, out_err);
4597
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004598 /*
4599 * The parent won't change (except when the clone is
4600 * flattened, which is already handled above).  So we only need to
4601 * record the parent spec if we have not already done so.
4602 */
4603 if (!rbd_dev->parent_spec) {
4604 parent_spec->pool_id = pool_id;
4605 parent_spec->image_id = image_id;
4606 parent_spec->snap_id = snap_id;
Alex Elder70cf49c2013-05-06 17:40:33 -05004607 rbd_dev->parent_spec = parent_spec;
4608 parent_spec = NULL; /* rbd_dev now owns this */
Ilya Dryomovfbba11b2014-06-27 21:46:33 +04004609 } else {
4610 kfree(image_id);
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004611 }
4612
4613 /*
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004614 * We always update the parent overlap. If it's zero we issue
4615 * a warning, as we will proceed as if there was no parent.
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004616 */
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004617 if (!overlap) {
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004618 if (parent_spec) {
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004619 /* refresh, careful to warn just once */
4620 if (rbd_dev->parent_overlap)
4621 rbd_warn(rbd_dev,
4622 "clone now standalone (overlap became 0)");
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004623 } else {
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004624 /* initial probe */
4625 rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004626 }
Alex Elder70cf49c2013-05-06 17:40:33 -05004627 }
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004628 rbd_dev->parent_overlap = overlap;
4629
Alex Elder86b00e02012-10-25 23:34:42 -05004630out:
4631 ret = 0;
4632out_err:
4633 kfree(reply_buf);
4634 rbd_spec_put(parent_spec);
4635
4636 return ret;
4637}
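/*
 * For reference, the "get_parent" reply decoded above is laid out as
 * (inferred from the decode calls, not quoted from a protocol spec):
 *
 *	__le64 pool_id;
 *	string image_id;	-- __le32 length followed by bytes
 *	__le64 snap_id;
 *	__le64 overlap;		-- bytes of parent data still visible
 */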
4638
Alex Eldercc070d52013-04-21 12:14:45 -05004639static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
4640{
4641 struct {
4642 __le64 stripe_unit;
4643 __le64 stripe_count;
4644 } __attribute__ ((packed)) striping_info_buf = { 0 };
4645 size_t size = sizeof (striping_info_buf);
4646 void *p;
Alex Eldercc070d52013-04-21 12:14:45 -05004647 int ret;
4648
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004649 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4650 &rbd_dev->header_oloc, "get_stripe_unit_count",
4651 NULL, 0, &striping_info_buf, size);
Alex Eldercc070d52013-04-21 12:14:45 -05004652 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4653 if (ret < 0)
4654 return ret;
4655 if (ret < size)
4656 return -ERANGE;
4657
Alex Eldercc070d52013-04-21 12:14:45 -05004658 p = &striping_info_buf;
Ilya Dryomovb1331852018-02-07 12:09:12 +01004659 rbd_dev->header.stripe_unit = ceph_decode_64(&p);
4660 rbd_dev->header.stripe_count = ceph_decode_64(&p);
Alex Eldercc070d52013-04-21 12:14:45 -05004661 return 0;
4662}
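/*
 * Illustrative numbers for the parameters read above (chosen for
 * clarity, not taken from a real image): with stripe_unit = 65536 and
 * stripe_count = 4, consecutive 64 KiB chunks of the image rotate
 * across 4 backing objects before moving on to the next object set.
 */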
4663
Ilya Dryomov7e973322017-01-25 18:16:22 +01004664static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
4665{
4666 __le64 data_pool_id;
4667 int ret;
4668
4669 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4670 &rbd_dev->header_oloc, "get_data_pool",
4671 NULL, 0, &data_pool_id, sizeof(data_pool_id));
4672 if (ret < 0)
4673 return ret;
4674 if (ret < sizeof(data_pool_id))
4675 return -EBADMSG;
4676
4677 rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
4678 WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
4679 return 0;
4680}
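/*
 * With RBD_FEATURE_DATA_POOL, the header object stays in the pool the
 * image was specified with (spec->pool_id, see __rbd_dev_create())
 * while data objects go to the header.data_pool_id fetched above.
 */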
4681
Alex Elder9e15b772012-10-30 19:40:33 -05004682static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
4683{
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004684 CEPH_DEFINE_OID_ONSTACK(oid);
Alex Elder9e15b772012-10-30 19:40:33 -05004685 size_t image_id_size;
4686 char *image_id;
4687 void *p;
4688 void *end;
4689 size_t size;
4690 void *reply_buf = NULL;
4691 size_t len = 0;
4692 char *image_name = NULL;
4693 int ret;
4694
4695 rbd_assert(!rbd_dev->spec->image_name);
4696
Alex Elder69e7a022012-11-01 08:39:26 -05004697 len = strlen(rbd_dev->spec->image_id);
4698 image_id_size = sizeof (__le32) + len;
Alex Elder9e15b772012-10-30 19:40:33 -05004699 image_id = kmalloc(image_id_size, GFP_KERNEL);
4700 if (!image_id)
4701 return NULL;
4702
4703 p = image_id;
Alex Elder41579762013-04-21 12:14:45 -05004704 end = image_id + image_id_size;
Alex Elder57385b52013-04-21 12:14:45 -05004705 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
Alex Elder9e15b772012-10-30 19:40:33 -05004706
4707 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
4708 reply_buf = kmalloc(size, GFP_KERNEL);
4709 if (!reply_buf)
4710 goto out;
4711
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004712 ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
4713 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
4714 "dir_get_name", image_id, image_id_size,
4715 reply_buf, size);
Alex Elder9e15b772012-10-30 19:40:33 -05004716 if (ret < 0)
4717 goto out;
4718 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05004719 end = reply_buf + ret;
4720
Alex Elder9e15b772012-10-30 19:40:33 -05004721 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
4722 if (IS_ERR(image_name))
4723 image_name = NULL;
4724 else
4725 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
4726out:
4727 kfree(reply_buf);
4728 kfree(image_id);
4729
4730 return image_name;
4731}
4732
Alex Elder2ad3d712013-04-30 00:44:33 -05004733static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4734{
4735 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4736 const char *snap_name;
4737 u32 which = 0;
4738
4739 /* Skip over names until we find the one we are looking for */
4740
4741 snap_name = rbd_dev->header.snap_names;
4742 while (which < snapc->num_snaps) {
4743 if (!strcmp(name, snap_name))
4744 return snapc->snaps[which];
4745 snap_name += strlen(snap_name) + 1;
4746 which++;
4747 }
4748 return CEPH_NOSNAP;
4749}
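/*
 * The walk above relies on the v1 header layout: snap_names is one
 * buffer of NUL-terminated names stored back to back, parallel to
 * snapc->snaps[].  Illustrative layout (names invented):
 *
 *	"snap1\0snap2\0snap3\0"  <->  snaps[0], snaps[1], snaps[2]
 */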
4750
4751static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4752{
4753 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4754 u32 which;
4755 bool found = false;
4756 u64 snap_id;
4757
4758 for (which = 0; !found && which < snapc->num_snaps; which++) {
4759 const char *snap_name;
4760
4761 snap_id = snapc->snaps[which];
4762 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
Josh Durginefadc982013-08-29 19:16:42 -07004763 if (IS_ERR(snap_name)) {
4764 /* ignore no-longer existing snapshots */
4765 if (PTR_ERR(snap_name) == -ENOENT)
4766 continue;
4767 else
4768 break;
4769 }
Alex Elder2ad3d712013-04-30 00:44:33 -05004770 found = !strcmp(name, snap_name);
4771 kfree(snap_name);
4772 }
4773 return found ? snap_id : CEPH_NOSNAP;
4774}
4775
4776/*
4777 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
4778 * no snapshot by that name is found, or if an error occurs.
4779 */
4780static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4781{
4782 if (rbd_dev->image_format == 1)
4783 return rbd_v1_snap_id_by_name(rbd_dev, name);
4784
4785 return rbd_v2_snap_id_by_name(rbd_dev, name);
4786}
4787
Alex Elder9e15b772012-10-30 19:40:33 -05004788/*
Ilya Dryomov04077592014-07-23 17:11:20 +04004789 * An image being mapped will have everything but the snap id.
Alex Elder9e15b772012-10-30 19:40:33 -05004790 */
Ilya Dryomov04077592014-07-23 17:11:20 +04004791static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
4792{
4793 struct rbd_spec *spec = rbd_dev->spec;
4794
4795 rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
4796 rbd_assert(spec->image_id && spec->image_name);
4797 rbd_assert(spec->snap_name);
4798
4799 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
4800 u64 snap_id;
4801
4802 snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
4803 if (snap_id == CEPH_NOSNAP)
4804 return -ENOENT;
4805
4806 spec->snap_id = snap_id;
4807 } else {
4808 spec->snap_id = CEPH_NOSNAP;
4809 }
4810
4811 return 0;
4812}
4813
4814/*
4815 * A parent image will have all ids but none of the names.
4816 *
4817 * All names in an rbd spec are dynamically allocated. It's OK if we
4818 * can't figure out the name for an image id.
4819 */
4820static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
Alex Elder9e15b772012-10-30 19:40:33 -05004821{
Alex Elder2e9f7f12013-04-26 09:43:48 -05004822 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4823 struct rbd_spec *spec = rbd_dev->spec;
4824 const char *pool_name;
4825 const char *image_name;
4826 const char *snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05004827 int ret;
4828
Ilya Dryomov04077592014-07-23 17:11:20 +04004829 rbd_assert(spec->pool_id != CEPH_NOPOOL);
4830 rbd_assert(spec->image_id);
4831 rbd_assert(spec->snap_id != CEPH_NOSNAP);
Alex Elder9e15b772012-10-30 19:40:33 -05004832
Alex Elder2e9f7f12013-04-26 09:43:48 -05004833 /* Get the pool name; we have to make our own copy of this */
Alex Elder9e15b772012-10-30 19:40:33 -05004834
Alex Elder2e9f7f12013-04-26 09:43:48 -05004835 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
4836 if (!pool_name) {
4837 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05004838 return -EIO;
4839 }
Alex Elder2e9f7f12013-04-26 09:43:48 -05004840 pool_name = kstrdup(pool_name, GFP_KERNEL);
4841 if (!pool_name)
Alex Elder9e15b772012-10-30 19:40:33 -05004842 return -ENOMEM;
4843
4844 /* Fetch the image name; tolerate failure here */
4845
Alex Elder2e9f7f12013-04-26 09:43:48 -05004846 image_name = rbd_dev_image_name(rbd_dev);
4847 if (!image_name)
Alex Elder06ecc6c2012-11-01 10:17:15 -05004848 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05004849
Ilya Dryomov04077592014-07-23 17:11:20 +04004850 /* Fetch the snapshot name */
Alex Elder9e15b772012-10-30 19:40:33 -05004851
Alex Elder2e9f7f12013-04-26 09:43:48 -05004852 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
Josh Durginda6a6b62013-09-04 17:57:31 -07004853 if (IS_ERR(snap_name)) {
4854 ret = PTR_ERR(snap_name);
Alex Elder9e15b772012-10-30 19:40:33 -05004855 goto out_err;
Alex Elder2e9f7f12013-04-26 09:43:48 -05004856 }
4857
4858 spec->pool_name = pool_name;
4859 spec->image_name = image_name;
4860 spec->snap_name = snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05004861
4862 return 0;
Ilya Dryomov04077592014-07-23 17:11:20 +04004863
Alex Elder9e15b772012-10-30 19:40:33 -05004864out_err:
Alex Elder2e9f7f12013-04-26 09:43:48 -05004865 kfree(image_name);
4866 kfree(pool_name);
Alex Elder9e15b772012-10-30 19:40:33 -05004867 return ret;
4868}
4869
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004870static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
Alex Elder35d489f2012-07-03 16:01:19 -05004871{
4872 size_t size;
4873 int ret;
4874 void *reply_buf;
4875 void *p;
4876 void *end;
4877 u64 seq;
4878 u32 snap_count;
4879 struct ceph_snap_context *snapc;
4880 u32 i;
4881
4882 /*
4883 * We'll need room for the seq value (maximum snapshot id),
4884 * snapshot count, and an array of that many snapshot ids.
4885 * For now we have a fixed upper limit on the number we're
4886 * prepared to receive.
4887 */
4888 size = sizeof (__le64) + sizeof (__le32) +
4889 RBD_MAX_SNAP_COUNT * sizeof (__le64);
4890 reply_buf = kzalloc(size, GFP_KERNEL);
4891 if (!reply_buf)
4892 return -ENOMEM;
4893
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004894 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4895 &rbd_dev->header_oloc, "get_snapcontext",
4896 NULL, 0, reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06004897 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder35d489f2012-07-03 16:01:19 -05004898 if (ret < 0)
4899 goto out;
4900
Alex Elder35d489f2012-07-03 16:01:19 -05004901 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05004902 end = reply_buf + ret;
4903 ret = -ERANGE;
Alex Elder35d489f2012-07-03 16:01:19 -05004904 ceph_decode_64_safe(&p, end, seq, out);
4905 ceph_decode_32_safe(&p, end, snap_count, out);
4906
4907 /*
4908 * Make sure the reported number of snapshot ids wouldn't go
4909 * beyond the end of our buffer. But before checking that,
4910 * make sure the computed size of the snapshot context we
4911 * allocate is representable in a size_t.
4912 */
4913 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
4914 / sizeof (u64)) {
4915 ret = -EINVAL;
4916 goto out;
4917 }
4918 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
4919 goto out;
Alex Elder468521c2013-04-26 09:43:47 -05004920 ret = 0;
Alex Elder35d489f2012-07-03 16:01:19 -05004921
Alex Elder812164f82013-04-30 00:44:32 -05004922 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
Alex Elder35d489f2012-07-03 16:01:19 -05004923 if (!snapc) {
4924 ret = -ENOMEM;
4925 goto out;
4926 }
Alex Elder35d489f2012-07-03 16:01:19 -05004927 snapc->seq = seq;
Alex Elder35d489f2012-07-03 16:01:19 -05004928 for (i = 0; i < snap_count; i++)
4929 snapc->snaps[i] = ceph_decode_64(&p);
4930
Alex Elder49ece552013-05-06 08:37:00 -05004931 ceph_put_snap_context(rbd_dev->header.snapc);
Alex Elder35d489f2012-07-03 16:01:19 -05004932 rbd_dev->header.snapc = snapc;
4933
4934 dout(" snap context seq = %llu, snap_count = %u\n",
Alex Elder57385b52013-04-21 12:14:45 -05004935 (unsigned long long)seq, (unsigned int)snap_count);
Alex Elder35d489f2012-07-03 16:01:19 -05004936out:
4937 kfree(reply_buf);
4938
Alex Elder57385b52013-04-21 12:14:45 -05004939 return ret;
Alex Elder35d489f2012-07-03 16:01:19 -05004940}
4941
Alex Elder54cac612013-04-30 00:44:33 -05004942static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
4943 u64 snap_id)
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004944{
4945 size_t size;
4946 void *reply_buf;
Alex Elder54cac612013-04-30 00:44:33 -05004947 __le64 snapid;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004948 int ret;
4949 void *p;
4950 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004951 char *snap_name;
4952
4953 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
4954 reply_buf = kmalloc(size, GFP_KERNEL);
4955 if (!reply_buf)
4956 return ERR_PTR(-ENOMEM);
4957
Alex Elder54cac612013-04-30 00:44:33 -05004958 snapid = cpu_to_le64(snap_id);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004959 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4960 &rbd_dev->header_oloc, "get_snapshot_name",
4961 &snapid, sizeof(snapid), reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06004962 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderf40eb342013-04-25 15:09:42 -05004963 if (ret < 0) {
4964 snap_name = ERR_PTR(ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004965 goto out;
Alex Elderf40eb342013-04-25 15:09:42 -05004966 }
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004967
4968 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05004969 end = reply_buf + ret;
Alex Eldere5c35532012-10-25 23:34:41 -05004970 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderf40eb342013-04-25 15:09:42 -05004971 if (IS_ERR(snap_name))
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004972 goto out;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004973
Alex Elderf40eb342013-04-25 15:09:42 -05004974 dout(" snap_id 0x%016llx snap_name = %s\n",
Alex Elder54cac612013-04-30 00:44:33 -05004975 (unsigned long long)snap_id, snap_name);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004976out:
4977 kfree(reply_buf);
4978
Alex Elderf40eb342013-04-25 15:09:42 -05004979 return snap_name;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004980}
4981
Alex Elder2df3fac2013-05-06 09:51:30 -05004982static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
Alex Elder117973f2012-08-31 17:29:55 -05004983{
Alex Elder2df3fac2013-05-06 09:51:30 -05004984 bool first_time = rbd_dev->header.object_prefix == NULL;
Alex Elder117973f2012-08-31 17:29:55 -05004985 int ret;
Alex Elder117973f2012-08-31 17:29:55 -05004986
Josh Durgin1617e402013-06-12 14:43:10 -07004987 ret = rbd_dev_v2_image_size(rbd_dev);
4988 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05004989 return ret;
Josh Durgin1617e402013-06-12 14:43:10 -07004990
Alex Elder2df3fac2013-05-06 09:51:30 -05004991 if (first_time) {
4992 ret = rbd_dev_v2_header_onetime(rbd_dev);
4993 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05004994 return ret;
Alex Elder2df3fac2013-05-06 09:51:30 -05004995 }
4996
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004997 ret = rbd_dev_v2_snap_context(rbd_dev);
Ilya Dryomovd194cd12015-08-31 18:22:10 +03004998 if (ret && first_time) {
4999 kfree(rbd_dev->header.object_prefix);
5000 rbd_dev->header.object_prefix = NULL;
5001 }
Alex Elder117973f2012-08-31 17:29:55 -05005002
5003 return ret;
5004}
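/*
 * Note the probe-vs-refresh split above: object prefix, features,
 * striping and data pool are fetched only once (first_time) and
 * assumed immutable, while the image size and snapshot context are
 * re-read on every refresh.
 */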
5005
Ilya Dryomova720ae02014-07-23 17:11:19 +04005006static int rbd_dev_header_info(struct rbd_device *rbd_dev)
5007{
5008 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5009
5010 if (rbd_dev->image_format == 1)
5011 return rbd_dev_v1_header_info(rbd_dev);
5012
5013 return rbd_dev_v2_header_info(rbd_dev);
5014}
5015
Alex Elder1ddbe942012-01-29 13:57:44 -06005016/*
Alex Eldere28fff262012-02-02 08:13:30 -06005017 * Skips over white space at *buf, and updates *buf to point to the
5018 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06005019 * the token (string of non-white space characters) found. Note
5020 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06005021 */
5022static inline size_t next_token(const char **buf)
5023{
5024 /*
5025 * These are the characters that produce nonzero for
5026 * isspace() in the "C" and "POSIX" locales.
5027 */
5028 const char *spaces = " \f\n\r\t\v";
5029
5030 *buf += strspn(*buf, spaces); /* Find start of token */
5031
5032 return strcspn(*buf, spaces); /* Return token length */
5033}
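/*
 * Small worked example (input invented): with *buf = "  rbd foo",
 * next_token() returns 3 and leaves *buf at "rbd foo".  Callers step
 * past the token themselves (buf += len) before asking for the next
 * one, as rbd_add_parse_args() does below.
 */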
5034
5035/*
Alex Elderea3352f2012-07-09 21:04:23 -05005036 * Finds the next token in *buf, dynamically allocates a buffer big
5037 * enough to hold a copy of it, and copies the token into the new
5038 * buffer. The copy is guaranteed to be terminated with '\0'. Note
5039 * that a duplicate buffer is created even for a zero-length token.
5040 *
5041 * Returns a pointer to the newly-allocated duplicate, or a null
5042 * pointer if memory for the duplicate was not available. If
5043 * the lenp argument is a non-null pointer, the length of the token
5044 * (not including the '\0') is returned in *lenp.
5045 *
5046 * If successful, the *buf pointer will be updated to point beyond
5047 * the end of the found token.
5048 *
5049 * Note: uses GFP_KERNEL for allocation.
5050 */
5051static inline char *dup_token(const char **buf, size_t *lenp)
5052{
5053 char *dup;
5054 size_t len;
5055
5056 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05005057 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05005058 if (!dup)
5059 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05005060 *(dup + len) = '\0';
5061 *buf += len;
5062
5063 if (lenp)
5064 *lenp = len;
5065
5066 return dup;
5067}
5068
5069/*
Alex Elder859c31d2012-10-25 23:34:42 -05005070 * Parse the options provided for an "rbd add" (i.e., rbd image
5071 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
5072 * and the data written is passed here via a NUL-terminated buffer.
5073 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05005074 *
Alex Elder859c31d2012-10-25 23:34:42 -05005075 * The information extracted from these options is recorded in
5076 * the other parameters which return dynamically-allocated
5077 * structures:
5078 * ceph_opts
5079 * The address of a pointer that will refer to a ceph options
5080 * structure. Caller must release the returned pointer using
5081 * ceph_destroy_options() when it is no longer needed.
5082 * rbd_opts
5083 * Address of an rbd options pointer. Fully initialized by
5084 * this function; caller must release with kfree().
5085 * spec
5086 * Address of an rbd image specification pointer. Fully
5087 * initialized by this function based on parsed options.
5088 * Caller must release with rbd_spec_put().
5089 *
5090 * The options passed take this form:
5091 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
5092 * where:
5093 * <mon_addrs>
5094 * A comma-separated list of one or more monitor addresses.
5095 * A monitor address is an ip address, optionally followed
5096 * by a port number (separated by a colon).
5097 * I.e.: ip1[:port1][,ip2[:port2]...]
5098 * <options>
5099 * A comma-separated list of ceph and/or rbd options.
5100 * <pool_name>
5101 * The name of the rados pool containing the rbd image.
5102 * <image_name>
5103 * The name of the image in that pool to map.
5104 * <snap_id>
5105 * An optional snapshot id. If provided, the mapping will
5106 * present data from the image at the time that snapshot was
5107 * created. The image head is used if no snapshot id is
5108 * provided. Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06005109 */
Alex Elder859c31d2012-10-25 23:34:42 -05005110static int rbd_add_parse_args(const char *buf,
Alex Elderdc79b112012-10-25 23:34:41 -05005111 struct ceph_options **ceph_opts,
Alex Elder859c31d2012-10-25 23:34:42 -05005112 struct rbd_options **opts,
5113 struct rbd_spec **rbd_spec)
Alex Eldera725f65e2012-02-02 08:13:30 -06005114{
Alex Elderd22f76e2012-07-12 10:46:35 -05005115 size_t len;
Alex Elder859c31d2012-10-25 23:34:42 -05005116 char *options;
Alex Elder0ddebc02012-10-25 23:34:41 -05005117 const char *mon_addrs;
Alex Elderecb4dc22013-04-26 09:43:47 -05005118 char *snap_name;
Alex Elder0ddebc02012-10-25 23:34:41 -05005119 size_t mon_addrs_size;
Alex Elder859c31d2012-10-25 23:34:42 -05005120 struct rbd_spec *spec = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05005121 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05005122 struct ceph_options *copts;
Alex Elderdc79b112012-10-25 23:34:41 -05005123 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06005124
5125 /* The first four tokens are required */
5126
Alex Elder7ef32142012-02-02 08:13:30 -06005127 len = next_token(&buf);
Alex Elder4fb5d6712012-11-01 10:17:15 -05005128 if (!len) {
5129 rbd_warn(NULL, "no monitor address(es) provided");
5130 return -EINVAL;
5131 }
Alex Elder0ddebc02012-10-25 23:34:41 -05005132 mon_addrs = buf;
Alex Elderf28e5652012-10-25 23:34:41 -05005133 mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06005134 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06005135
Alex Elderdc79b112012-10-25 23:34:41 -05005136 ret = -EINVAL;
Alex Elderf28e5652012-10-25 23:34:41 -05005137 options = dup_token(&buf, NULL);
5138 if (!options)
Alex Elderdc79b112012-10-25 23:34:41 -05005139 return -ENOMEM;
Alex Elder4fb5d6712012-11-01 10:17:15 -05005140 if (!*options) {
5141 rbd_warn(NULL, "no options provided");
5142 goto out_err;
5143 }
Alex Eldera725f65e2012-02-02 08:13:30 -06005144
Alex Elder859c31d2012-10-25 23:34:42 -05005145 spec = rbd_spec_alloc();
5146 if (!spec)
Alex Elderf28e5652012-10-25 23:34:41 -05005147 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05005148
5149 spec->pool_name = dup_token(&buf, NULL);
5150 if (!spec->pool_name)
5151 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05005152 if (!*spec->pool_name) {
5153 rbd_warn(NULL, "no pool name provided");
5154 goto out_err;
5155 }
Alex Eldere28fff262012-02-02 08:13:30 -06005156
Alex Elder69e7a022012-11-01 08:39:26 -05005157 spec->image_name = dup_token(&buf, NULL);
Alex Elder859c31d2012-10-25 23:34:42 -05005158 if (!spec->image_name)
Alex Elderf28e5652012-10-25 23:34:41 -05005159 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05005160 if (!*spec->image_name) {
5161 rbd_warn(NULL, "no image name provided");
5162 goto out_err;
5163 }
Alex Eldere28fff262012-02-02 08:13:30 -06005164
Alex Elderf28e5652012-10-25 23:34:41 -05005165 /*
5166 * Snapshot name is optional; default is to use "-"
5167 * (indicating the head/no snapshot).
5168 */
Alex Elder3feeb8942012-08-31 17:29:52 -05005169 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05005170 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05005171 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
5172 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elderf28e5652012-10-25 23:34:41 -05005173 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
Alex Elderdc79b112012-10-25 23:34:41 -05005174 ret = -ENAMETOOLONG;
Alex Elderf28e5652012-10-25 23:34:41 -05005175 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05005176 }
Alex Elderecb4dc22013-04-26 09:43:47 -05005177 snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
5178 if (!snap_name)
Alex Elderf28e5652012-10-25 23:34:41 -05005179 goto out_mem;
Alex Elderecb4dc22013-04-26 09:43:47 -05005180 *(snap_name + len) = '\0';
5181 spec->snap_name = snap_name;
Alex Eldere5c35532012-10-25 23:34:41 -05005182
Alex Elder0ddebc02012-10-25 23:34:41 -05005183 /* Initialize all rbd options to the defaults */
Alex Eldere28fff262012-02-02 08:13:30 -06005184
Alex Elder4e9afeb2012-10-25 23:34:41 -05005185 rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
5186 if (!rbd_opts)
5187 goto out_mem;
5188
5189 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
Ilya Dryomovb5584182015-06-23 16:21:19 +03005190 rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
Ilya Dryomov80de1912016-09-20 14:23:17 +02005191 rbd_opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
Ilya Dryomove010dd02017-04-13 12:17:39 +02005192 rbd_opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
Alex Elderd22f76e2012-07-12 10:46:35 -05005193
Alex Elder859c31d2012-10-25 23:34:42 -05005194 copts = ceph_parse_options(options, mon_addrs,
Alex Elder0ddebc02012-10-25 23:34:41 -05005195 mon_addrs + mon_addrs_size - 1,
Alex Elder4e9afeb2012-10-25 23:34:41 -05005196 parse_rbd_opts_token, rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05005197 if (IS_ERR(copts)) {
5198 ret = PTR_ERR(copts);
Alex Elderdc79b112012-10-25 23:34:41 -05005199 goto out_err;
5200 }
Alex Elder859c31d2012-10-25 23:34:42 -05005201 kfree(options);
5202
5203 *ceph_opts = copts;
Alex Elder4e9afeb2012-10-25 23:34:41 -05005204 *opts = rbd_opts;
Alex Elder859c31d2012-10-25 23:34:42 -05005205 *rbd_spec = spec;
Alex Elder0ddebc02012-10-25 23:34:41 -05005206
Alex Elderdc79b112012-10-25 23:34:41 -05005207 return 0;
Alex Elderf28e5652012-10-25 23:34:41 -05005208out_mem:
Alex Elderdc79b112012-10-25 23:34:41 -05005209 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05005210out_err:
Alex Elder859c31d2012-10-25 23:34:42 -05005211 kfree(rbd_opts);
5212 rbd_spec_put(spec);
Alex Elderf28e5652012-10-25 23:34:41 -05005213 kfree(options);
Alex Elderd22f76e2012-07-12 10:46:35 -05005214
Alex Elderdc79b112012-10-25 23:34:41 -05005215 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06005216}
5217
Ilya Dryomove010dd02017-04-13 12:17:39 +02005218static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
5219{
5220 down_write(&rbd_dev->lock_rwsem);
5221 if (__rbd_is_lock_owner(rbd_dev))
5222 rbd_unlock(rbd_dev);
5223 up_write(&rbd_dev->lock_rwsem);
5224}
5225
5226static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
5227{
5228 if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
5229 rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
5230 return -EINVAL;
5231 }
5232
5233 /* FIXME: "rbd map --exclusive" should be in interruptible */
5234 down_read(&rbd_dev->lock_rwsem);
5235 rbd_wait_state_locked(rbd_dev);
5236 up_read(&rbd_dev->lock_rwsem);
5237 if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
5238 rbd_warn(rbd_dev, "failed to acquire exclusive lock");
5239 return -EROFS;
5240 }
5241
5242 return 0;
5243}
5244
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005245/*
Alex Elder589d30e2012-07-10 20:30:11 -05005246 * An rbd format 2 image has a unique identifier, distinct from the
5247 * name given to it by the user. Internally, that identifier is
5248 * what's used to specify the names of objects related to the image.
5249 *
5250 * A special "rbd id" object is used to map an rbd image name to its
5251 * id. If that object doesn't exist, then there is no v2 rbd image
5252 * with the supplied name.
5253 *
5254 * This function will record the given rbd_dev's image_id field if
5255 * it can be determined, and in that case will return 0. If any
5256 * errors occur a negative errno will be returned and the rbd_dev's
5257 * image_id field will be unchanged (and should be NULL).
5258 */
5259static int rbd_dev_image_id(struct rbd_device *rbd_dev)
5260{
5261 int ret;
5262 size_t size;
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005263 CEPH_DEFINE_OID_ONSTACK(oid);
Alex Elder589d30e2012-07-10 20:30:11 -05005264 void *response;
Alex Elderc0fba362013-04-25 23:15:08 -05005265 char *image_id;
Alex Elder2f82ee52012-10-30 19:40:33 -05005266
Alex Elder589d30e2012-07-10 20:30:11 -05005267 /*
Alex Elder2c0d0a12012-10-30 19:40:33 -05005268 * When probing a parent image, the image id is already
5269 * known (and the image name likely is not). There's no
Alex Elderc0fba362013-04-25 23:15:08 -05005270 * need to fetch the image id again in this case. We
5271 * do still need to set the image format though.
Alex Elder2c0d0a12012-10-30 19:40:33 -05005272 */
Alex Elderc0fba362013-04-25 23:15:08 -05005273 if (rbd_dev->spec->image_id) {
5274 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
5275
Alex Elder2c0d0a12012-10-30 19:40:33 -05005276 return 0;
Alex Elderc0fba362013-04-25 23:15:08 -05005277 }
Alex Elder2c0d0a12012-10-30 19:40:33 -05005278
5279 /*
Alex Elder589d30e2012-07-10 20:30:11 -05005280 * First, see if the format 2 image id file exists, and if
5281 * so, get the image's persistent id from it.
5282 */
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005283 ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
5284 rbd_dev->spec->image_name);
5285 if (ret)
5286 return ret;
5287
5288 dout("rbd id object name is %s\n", oid.name);
Alex Elder589d30e2012-07-10 20:30:11 -05005289
5290 /* Response will be an encoded string, which includes a length */
5291
5292 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
5293 response = kzalloc(size, GFP_NOIO);
5294 if (!response) {
5295 ret = -ENOMEM;
5296 goto out;
5297 }
5298
Alex Elderc0fba362013-04-25 23:15:08 -05005299 /* If it doesn't exist we'll assume it's a format 1 image */
5300
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005301 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5302 "get_id", NULL, 0,
5303 response, RBD_IMAGE_ID_LEN_MAX);
Alex Elder36be9a72013-01-19 00:30:28 -06005304 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderc0fba362013-04-25 23:15:08 -05005305 if (ret == -ENOENT) {
5306 image_id = kstrdup("", GFP_KERNEL);
5307 ret = image_id ? 0 : -ENOMEM;
5308 if (!ret)
5309 rbd_dev->image_format = 1;
Ilya Dryomov7dd440c2014-09-11 18:49:18 +04005310 } else if (ret >= 0) {
Alex Elderc0fba362013-04-25 23:15:08 -05005311 void *p = response;
Alex Elder589d30e2012-07-10 20:30:11 -05005312
Alex Elderc0fba362013-04-25 23:15:08 -05005313 image_id = ceph_extract_encoded_string(&p, p + ret,
Alex Elder979ed482012-11-01 08:39:26 -05005314 NULL, GFP_NOIO);
Duan Jiong461f7582014-04-11 16:38:12 +08005315 ret = PTR_ERR_OR_ZERO(image_id);
Alex Elderc0fba362013-04-25 23:15:08 -05005316 if (!ret)
5317 rbd_dev->image_format = 2;
Alex Elderc0fba362013-04-25 23:15:08 -05005318 }
5319
5320 if (!ret) {
5321 rbd_dev->spec->image_id = image_id;
5322 dout("image_id is %s\n", image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05005323 }
5324out:
5325 kfree(response);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005326 ceph_oid_destroy(&oid);
Alex Elder589d30e2012-07-10 20:30:11 -05005327 return ret;
5328}
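/*
 * Concrete shape of the lookup above (id value invented): for image
 * "myimage" the id object is named "<RBD_ID_PREFIX>myimage", and a
 * successful "get_id" call returns an encoded string such as
 * "86696a49cf9a" that becomes spec->image_id.  A missing id object
 * (-ENOENT) marks the image as format 1 with an empty id.
 */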
5329
Alex Elder3abef3b2013-05-13 20:35:37 -05005330/*
5331 * Undo whatever state changes are made by a v1 or v2 header info
5332 * call.
5333 */
Alex Elder6fd48b32013-04-28 23:32:34 -05005334static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
5335{
5336 struct rbd_image_header *header;
5337
Ilya Dryomove69b8d42015-01-19 12:06:14 +03005338 rbd_dev_parent_put(rbd_dev);
Alex Elder6fd48b32013-04-28 23:32:34 -05005339
5340 /* Free dynamic fields from the header, then zero it out */
5341
5342 header = &rbd_dev->header;
Alex Elder812164f82013-04-30 00:44:32 -05005343 ceph_put_snap_context(header->snapc);
Alex Elder6fd48b32013-04-28 23:32:34 -05005344 kfree(header->snap_sizes);
5345 kfree(header->snap_names);
5346 kfree(header->object_prefix);
5347 memset(header, 0, sizeof (*header));
5348}
5349
Alex Elder2df3fac2013-05-06 09:51:30 -05005350static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
Alex Eldera30b71b2012-07-10 20:30:11 -05005351{
5352 int ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05005353
Alex Elder1e130192012-07-03 16:01:19 -05005354 ret = rbd_dev_v2_object_prefix(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05005355 if (ret)
Alex Elder1e130192012-07-03 16:01:19 -05005356 goto out_err;
Alex Elderb1b54022012-07-03 16:01:19 -05005357
Alex Elder2df3fac2013-05-06 09:51:30 -05005358 /*
5359 * Get and check the features for the image.  Currently the
5360 * features are assumed to never change.
5361 */
Alex Elderb1b54022012-07-03 16:01:19 -05005362 ret = rbd_dev_v2_features(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05005363 if (ret)
Alex Elderb1b54022012-07-03 16:01:19 -05005364 goto out_err;
Alex Elder35d489f2012-07-03 16:01:19 -05005365
Alex Eldercc070d52013-04-21 12:14:45 -05005366 /* If the image supports fancy striping, get its parameters */
5367
5368 if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
5369 ret = rbd_dev_v2_striping_info(rbd_dev);
5370 if (ret < 0)
5371 goto out_err;
5372 }
Alex Eldera30b71b2012-07-10 20:30:11 -05005373
Ilya Dryomov7e973322017-01-25 18:16:22 +01005374 if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
5375 ret = rbd_dev_v2_data_pool(rbd_dev);
5376 if (ret)
5377 goto out_err;
5378 }
5379
Ilya Dryomov263423f2017-01-25 18:16:22 +01005380 rbd_init_layout(rbd_dev);
Alex Elder35152972012-08-31 17:29:55 -05005381 return 0;
Ilya Dryomov263423f2017-01-25 18:16:22 +01005382
Alex Elder9d475de2012-07-03 16:01:19 -05005383out_err:
Alex Elder642a2532013-05-06 17:40:33 -05005384 rbd_dev->header.features = 0;
Alex Elder1e130192012-07-03 16:01:19 -05005385 kfree(rbd_dev->header.object_prefix);
5386 rbd_dev->header.object_prefix = NULL;
Alex Elder9d475de2012-07-03 16:01:19 -05005387 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05005388}
5389
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005390/*
5391 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
5392 * rbd_dev_image_probe() recursion depth, which means it's also the
5393 * length of the already discovered part of the parent chain.
5394 */
5395static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
Alex Elder83a06262012-10-30 15:47:17 -05005396{
Alex Elder2f82ee52012-10-30 19:40:33 -05005397 struct rbd_device *parent = NULL;
Alex Elder124afba2013-04-26 15:44:36 -05005398 int ret;
5399
5400 if (!rbd_dev->parent_spec)
5401 return 0;
Alex Elder124afba2013-04-26 15:44:36 -05005402
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005403 if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
5404 pr_info("parent chain is too long (%d)\n", depth);
5405 ret = -EINVAL;
5406 goto out_err;
5407 }
5408
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005409 parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005410 if (!parent) {
5411 ret = -ENOMEM;
Alex Elder124afba2013-04-26 15:44:36 -05005412 goto out_err;
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005413 }
5414
5415 /*
5416 * Images related by parent/child relationships always share
5417 * rbd_client and spec/parent_spec, so bump their refcounts.
5418 */
5419 __rbd_get_client(rbd_dev->rbd_client);
5420 rbd_spec_get(rbd_dev->parent_spec);
Alex Elder124afba2013-04-26 15:44:36 -05005421
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005422 ret = rbd_dev_image_probe(parent, depth);
Alex Elder124afba2013-04-26 15:44:36 -05005423 if (ret < 0)
5424 goto out_err;
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005425
Alex Elder124afba2013-04-26 15:44:36 -05005426 rbd_dev->parent = parent;
Alex Eldera2acd002013-05-08 22:50:04 -05005427 atomic_set(&rbd_dev->parent_ref, 1);
Alex Elder124afba2013-04-26 15:44:36 -05005428 return 0;
Alex Elder124afba2013-04-26 15:44:36 -05005429
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005430out_err:
5431 rbd_dev_unparent(rbd_dev);
Markus Elfring1761b222015-11-23 20:16:45 +01005432 rbd_dev_destroy(parent);
Alex Elder124afba2013-04-26 15:44:36 -05005433 return ret;
5434}
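
/*
 * Illustration with hypothetical image names: mapping a clone of a
 * clone walks the chain
 *
 *   "child" (mapped)  ->  "base"@snap1  ->  "golden"@snap0
 *
 * with one recursive rbd_dev_image_probe() call per ancestor, so
 * RBD_MAX_PARENT_CHAIN_LEN bounds both the recursion depth and the
 * kernel stack it consumes.  Teardown unwinds the same chain in
 * rbd_dev_remove_parent().
 */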
5435
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005436static void rbd_dev_device_release(struct rbd_device *rbd_dev)
5437{
5438 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5439 rbd_dev_mapping_clear(rbd_dev);
5440 rbd_free_disk(rbd_dev);
5441 if (!single_major)
5442 unregister_blkdev(rbd_dev->major, rbd_dev->name);
5443}
5444
Ilya Dryomov811c6682016-04-15 16:22:16 +02005445/*
5446 * rbd_dev->header_rwsem must be locked for write and will be unlocked
5447 * upon return.
5448 */
Alex Elder200a6a82013-04-28 23:32:34 -05005449static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
Alex Elder124afba2013-04-26 15:44:36 -05005450{
Alex Elder83a06262012-10-30 15:47:17 -05005451 int ret;
Alex Elder83a06262012-10-30 15:47:17 -05005452
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005453 /* Record our major and minor device numbers. */
Alex Elder83a06262012-10-30 15:47:17 -05005454
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005455 if (!single_major) {
5456 ret = register_blkdev(0, rbd_dev->name);
5457 if (ret < 0)
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005458 goto err_out_unlock;
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005459
5460 rbd_dev->major = ret;
5461 rbd_dev->minor = 0;
5462 } else {
5463 rbd_dev->major = rbd_major;
5464 rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
5465 }
Alex Elder83a06262012-10-30 15:47:17 -05005466
5467 /* Set up the blkdev mapping. */
5468
5469 ret = rbd_init_disk(rbd_dev);
5470 if (ret)
5471 goto err_out_blkdev;
5472
Alex Elderf35a4de2013-05-06 09:51:29 -05005473 ret = rbd_dev_mapping_set(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05005474 if (ret)
5475 goto err_out_disk;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04005476
Alex Elderf35a4de2013-05-06 09:51:29 -05005477 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
Ilya Dryomov9568c932017-10-12 12:35:19 +02005478 set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);
Alex Elderf35a4de2013-05-06 09:51:29 -05005479
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005480 ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
Alex Elderf35a4de2013-05-06 09:51:29 -05005481 if (ret)
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04005482 goto err_out_mapping;
Alex Elder83a06262012-10-30 15:47:17 -05005483
Alex Elder129b79d2013-04-26 15:44:36 -05005484 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
Ilya Dryomov811c6682016-04-15 16:22:16 +02005485 up_write(&rbd_dev->header_rwsem);
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005486 return 0;
Alex Elder2f82ee52012-10-30 19:40:33 -05005487
Alex Elderf35a4de2013-05-06 09:51:29 -05005488err_out_mapping:
5489 rbd_dev_mapping_clear(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05005490err_out_disk:
5491 rbd_free_disk(rbd_dev);
5492err_out_blkdev:
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005493 if (!single_major)
5494 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Ilya Dryomov811c6682016-04-15 16:22:16 +02005495err_out_unlock:
5496 up_write(&rbd_dev->header_rwsem);
Alex Elder83a06262012-10-30 15:47:17 -05005497 return ret;
5498}
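
/*
 * Device number policy, as implemented above: without single_major
 * every mapping registers its own dynamically allocated major and
 * always uses minor 0; with single_major all mappings share rbd_major
 * and derive their minor from the device id (rbd_dev_id_to_minor()),
 * which leaves room for partition minors under the same major.
 */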
5499
Alex Elder332bb122013-04-27 09:59:30 -05005500static int rbd_dev_header_name(struct rbd_device *rbd_dev)
5501{
5502 struct rbd_spec *spec = rbd_dev->spec;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005503 int ret;
Alex Elder332bb122013-04-27 09:59:30 -05005504
5505 /* Record the header object name for this rbd image. */
5506
5507 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder332bb122013-04-27 09:59:30 -05005508 if (rbd_dev->image_format == 1)
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005509 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
5510 spec->image_name, RBD_SUFFIX);
Alex Elder332bb122013-04-27 09:59:30 -05005511 else
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005512 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
5513 RBD_HEADER_PREFIX, spec->image_id);
Alex Elder332bb122013-04-27 09:59:30 -05005514
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005515 return ret;
Alex Elder332bb122013-04-27 09:59:30 -05005516}
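
/*
 * Example results (the image id below is illustrative): a format 1
 * image named "foo" gets header object "foo.rbd", while a format 2
 * image with id "1027b5c5eafb" gets "rbd_header.1027b5c5eafb",
 * assuming the usual RBD_SUFFIX and RBD_HEADER_PREFIX values.
 */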
5517
Alex Elder200a6a82013-04-28 23:32:34 -05005518static void rbd_dev_image_release(struct rbd_device *rbd_dev)
5519{
Alex Elder6fd48b32013-04-28 23:32:34 -05005520 rbd_dev_unprobe(rbd_dev);
Ilya Dryomovfd22aef2017-04-13 12:17:37 +02005521 if (rbd_dev->opts)
5522 rbd_unregister_watch(rbd_dev);
Alex Elder6fd48b32013-04-28 23:32:34 -05005523 rbd_dev->image_format = 0;
5524 kfree(rbd_dev->spec->image_id);
5525 rbd_dev->spec->image_id = NULL;
Alex Elder200a6a82013-04-28 23:32:34 -05005526}
5527
Alex Eldera30b71b2012-07-10 20:30:11 -05005528/*
5529 * Probe for the existence of the header object for the given rbd
Alex Elder1f3ef782013-05-06 17:40:33 -05005530 * device. If this image is the one being mapped (i.e., not a
5531 * parent), initiate a watch on its header object before using that
5532 * object to get detailed information about the rbd image.
Alex Eldera30b71b2012-07-10 20:30:11 -05005533 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005534static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
Alex Eldera30b71b2012-07-10 20:30:11 -05005535{
5536 int ret;
5537
5538 /*
Alex Elder3abef3b2013-05-13 20:35:37 -05005539 * Get the id from the image id object. Unless there's an
5540 * error, rbd_dev->spec->image_id will be filled in with
5541 * a dynamically-allocated string, and rbd_dev->image_format
5542 * will be set to either 1 or 2.
Alex Eldera30b71b2012-07-10 20:30:11 -05005543 */
5544 ret = rbd_dev_image_id(rbd_dev);
5545 if (ret)
Alex Elderc0fba362013-04-25 23:15:08 -05005546 return ret;
Alex Elderc0fba362013-04-25 23:15:08 -05005547
Alex Elder332bb122013-04-27 09:59:30 -05005548 ret = rbd_dev_header_name(rbd_dev);
5549 if (ret)
5550 goto err_out_format;
5551
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005552 if (!depth) {
Ilya Dryomov99d16942016-08-12 16:11:41 +02005553 ret = rbd_register_watch(rbd_dev);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005554 if (ret) {
5555 if (ret == -ENOENT)
5556 pr_info("image %s/%s does not exist\n",
5557 rbd_dev->spec->pool_name,
5558 rbd_dev->spec->image_name);
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005559 goto err_out_format;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005560 }
Alex Elder1f3ef782013-05-06 17:40:33 -05005561 }
Alex Elderb644de22013-04-27 09:59:31 -05005562
Ilya Dryomova720ae02014-07-23 17:11:19 +04005563 ret = rbd_dev_header_info(rbd_dev);
Alex Elder5655c4d2013-04-25 23:15:08 -05005564 if (ret)
Alex Elderb644de22013-04-27 09:59:31 -05005565 goto err_out_watch;
Alex Elder83a06262012-10-30 15:47:17 -05005566
Ilya Dryomov04077592014-07-23 17:11:20 +04005567 /*
5568 * If this image is the one being mapped, we have pool name and
5569 * id, image name and id, and snap name - need to fill snap id.
5570 * Otherwise this is a parent image, identified by pool, image
5571 * and snap ids - need to fill in names for those ids.
5572 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005573 if (!depth)
Ilya Dryomov04077592014-07-23 17:11:20 +04005574 ret = rbd_spec_fill_snap_id(rbd_dev);
5575 else
5576 ret = rbd_spec_fill_names(rbd_dev);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005577 if (ret) {
5578 if (ret == -ENOENT)
5579 pr_info("snap %s/%s@%s does not exist\n",
5580 rbd_dev->spec->pool_name,
5581 rbd_dev->spec->image_name,
5582 rbd_dev->spec->snap_name);
Alex Elder33dca392013-04-30 00:44:33 -05005583 goto err_out_probe;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005584 }
Alex Elder9bb81c92013-04-27 09:59:30 -05005585
Ilya Dryomove8f59b52014-07-24 10:42:13 +04005586 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
5587 ret = rbd_dev_v2_parent_info(rbd_dev);
5588 if (ret)
5589 goto err_out_probe;
5590
5591 /*
5592 * Need to warn users if this image is the one being
5593 * mapped and has a parent.
5594 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005595 if (!depth && rbd_dev->parent_spec)
Ilya Dryomove8f59b52014-07-24 10:42:13 +04005596 rbd_warn(rbd_dev,
5597 "WARNING: kernel layering is EXPERIMENTAL!");
5598 }
5599
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005600 ret = rbd_dev_probe_parent(rbd_dev, depth);
Alex Elder30d60ba2013-05-06 09:51:30 -05005601 if (ret)
5602 goto err_out_probe;
Alex Elder83a06262012-10-30 15:47:17 -05005603
Alex Elder30d60ba2013-05-06 09:51:30 -05005604 dout("discovered format %u image, header name is %s\n",
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005605 rbd_dev->image_format, rbd_dev->header_oid.name);
Alex Elder30d60ba2013-05-06 09:51:30 -05005606 return 0;
Ilya Dryomove8f59b52014-07-24 10:42:13 +04005607
Alex Elder6fd48b32013-04-28 23:32:34 -05005608err_out_probe:
5609 rbd_dev_unprobe(rbd_dev);
Alex Elderb644de22013-04-27 09:59:31 -05005610err_out_watch:
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005611 if (!depth)
Ilya Dryomov99d16942016-08-12 16:11:41 +02005612 rbd_unregister_watch(rbd_dev);
Alex Elder332bb122013-04-27 09:59:30 -05005613err_out_format:
5614 rbd_dev->image_format = 0;
Alex Elder5655c4d2013-04-25 23:15:08 -05005615 kfree(rbd_dev->spec->image_id);
5616 rbd_dev->spec->image_id = NULL;
Alex Elder5655c4d2013-04-25 23:15:08 -05005617 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05005618}
5619
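/*
 * Sysfs "add" usage, per Documentation/ABI/testing/sysfs-bus-rbd
 * (monitor address, key and names below are illustrative):
 *
 *   $ echo "1.2.3.4:6789 name=admin,secret=<key> rbd foo" \
 *         > /sys/bus/rbd/add
 *
 * maps image "foo" from pool "rbd"; an optional snapshot name may
 * follow the image name.
 */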
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005620static ssize_t do_rbd_add(struct bus_type *bus,
5621 const char *buf,
5622 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005623{
Alex Eldercb8627c2012-07-09 21:04:23 -05005624 struct rbd_device *rbd_dev = NULL;
Alex Elderdc79b112012-10-25 23:34:41 -05005625 struct ceph_options *ceph_opts = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05005626 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05005627 struct rbd_spec *spec = NULL;
Alex Elder9d3997f2012-10-25 23:34:42 -05005628 struct rbd_client *rbdc;
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02005629 int rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005630
5631 if (!try_module_get(THIS_MODULE))
5632 return -ENODEV;
5633
Alex Eldera725f65e2012-02-02 08:13:30 -06005634 /* parse add command */
Alex Elder859c31d2012-10-25 23:34:42 -05005635 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
Alex Elderdc79b112012-10-25 23:34:41 -05005636 if (rc < 0)
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02005637 goto out;
Alex Eldera725f65e2012-02-02 08:13:30 -06005638
Alex Elder9d3997f2012-10-25 23:34:42 -05005639 rbdc = rbd_get_client(ceph_opts);
5640 if (IS_ERR(rbdc)) {
5641 rc = PTR_ERR(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05005642 goto err_out_args;
Alex Elder9d3997f2012-10-25 23:34:42 -05005643 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005644
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005645 /* pick the pool */
Ilya Dryomovdd435852018-02-22 13:43:24 +01005646 rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005647 if (rc < 0) {
5648 if (rc == -ENOENT)
5649 pr_info("pool %s does not exist\n", spec->pool_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005650 goto err_out_client;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005651 }
Alex Elderc0cd10db2013-04-26 09:43:47 -05005652 spec->pool_id = (u64)rc;
Alex Elder859c31d2012-10-25 23:34:42 -05005653
Ilya Dryomovd1475432015-06-22 13:24:48 +03005654 rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02005655 if (!rbd_dev) {
5656 rc = -ENOMEM;
Alex Elderbd4ba652012-10-25 23:34:42 -05005657 goto err_out_client;
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02005658 }
Alex Elderc53d5892012-10-25 23:34:42 -05005659 rbdc = NULL; /* rbd_dev now owns this */
5660 spec = NULL; /* rbd_dev now owns this */
Ilya Dryomovd1475432015-06-22 13:24:48 +03005661 rbd_opts = NULL; /* rbd_dev now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005662
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02005663 rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
5664 if (!rbd_dev->config_info) {
5665 rc = -ENOMEM;
5666 goto err_out_rbd_dev;
5667 }
5668
Ilya Dryomov811c6682016-04-15 16:22:16 +02005669 down_write(&rbd_dev->header_rwsem);
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005670 rc = rbd_dev_image_probe(rbd_dev, 0);
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02005671 if (rc < 0) {
5672 up_write(&rbd_dev->header_rwsem);
Alex Elderc53d5892012-10-25 23:34:42 -05005673 goto err_out_rbd_dev;
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02005674 }
Alex Elder05fd6f62012-08-29 17:11:07 -05005675
Alex Elder7ce4eef2013-05-06 17:40:33 -05005676 /* If we are mapping a snapshot it must be marked read-only */
Alex Elder7ce4eef2013-05-06 17:40:33 -05005677 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Ilya Dryomov9568c932017-10-12 12:35:19 +02005678 rbd_dev->opts->read_only = true;
Alex Elder7ce4eef2013-05-06 17:40:33 -05005679
Alex Elderb536f692013-04-28 23:32:34 -05005680 rc = rbd_dev_device_setup(rbd_dev);
Ilya Dryomovfd22aef2017-04-13 12:17:37 +02005681 if (rc)
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02005682 goto err_out_image_probe;
Alex Elderb536f692013-04-28 23:32:34 -05005683
Ilya Dryomove010dd02017-04-13 12:17:39 +02005684 if (rbd_dev->opts->exclusive) {
5685 rc = rbd_add_acquire_lock(rbd_dev);
5686 if (rc)
5687 goto err_out_device_setup;
Alex Elderb536f692013-04-28 23:32:34 -05005688 }
5689
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005690 /* Everything's ready. Announce the disk to the world. */
5691
5692 rc = device_add(&rbd_dev->dev);
5693 if (rc)
Ilya Dryomove010dd02017-04-13 12:17:39 +02005694 goto err_out_image_lock;
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005695
5696 add_disk(rbd_dev->disk);
5697 /* see rbd_init_disk() */
5698 blk_put_queue(rbd_dev->disk->queue);
5699
5700 spin_lock(&rbd_dev_list_lock);
5701 list_add_tail(&rbd_dev->node, &rbd_dev_list);
5702 spin_unlock(&rbd_dev_list_lock);
5703
5704 pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
5705 (unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
5706 rbd_dev->header.features);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02005707 rc = count;
5708out:
5709 module_put(THIS_MODULE);
5710 return rc;
Alex Elder3abef3b2013-05-13 20:35:37 -05005711
Ilya Dryomove010dd02017-04-13 12:17:39 +02005712err_out_image_lock:
5713 rbd_dev_image_unlock(rbd_dev);
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005714err_out_device_setup:
5715 rbd_dev_device_release(rbd_dev);
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02005716err_out_image_probe:
5717 rbd_dev_image_release(rbd_dev);
Alex Elderc53d5892012-10-25 23:34:42 -05005718err_out_rbd_dev:
5719 rbd_dev_destroy(rbd_dev);
Alex Elderbd4ba652012-10-25 23:34:42 -05005720err_out_client:
Alex Elder9d3997f2012-10-25 23:34:42 -05005721 rbd_put_client(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05005722err_out_args:
Alex Elder859c31d2012-10-25 23:34:42 -05005723 rbd_spec_put(spec);
Ilya Dryomovd1475432015-06-22 13:24:48 +03005724 kfree(rbd_opts);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02005725 goto out;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005726}
5727
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005728static ssize_t rbd_add(struct bus_type *bus,
5729 const char *buf,
5730 size_t count)
5731{
5732 if (single_major)
5733 return -EINVAL;
5734
5735 return do_rbd_add(bus, buf, count);
5736}
5737
5738static ssize_t rbd_add_single_major(struct bus_type *bus,
5739 const char *buf,
5740 size_t count)
5741{
5742 return do_rbd_add(bus, buf, count);
5743}
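
/*
 * With single_major=Y the plain "add" attribute is rejected with
 * -EINVAL above; userspace is expected to go through
 * "add_single_major" instead so that device numbers stay under the
 * shared major.
 */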
5744
Alex Elder05a46af2013-04-26 15:44:36 -05005745static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
5746{
Alex Elderad945fc2013-04-26 15:44:36 -05005747 while (rbd_dev->parent) {
Alex Elder05a46af2013-04-26 15:44:36 -05005748 struct rbd_device *first = rbd_dev;
5749 struct rbd_device *second = first->parent;
5750 struct rbd_device *third;
5751
5752 /*
5753 * Walk down to the parent that has no grandparent
5754 * and remove it.
5755 */
5756 while (second && (third = second->parent)) {
5757 first = second;
5758 second = third;
5759 }
Alex Elderad945fc2013-04-26 15:44:36 -05005760 rbd_assert(second);
Alex Elder8ad42cd2013-04-28 23:32:34 -05005761 rbd_dev_image_release(second);
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02005762 rbd_dev_destroy(second);
Alex Elderad945fc2013-04-26 15:44:36 -05005763 first->parent = NULL;
5764 first->parent_overlap = 0;
5765
5766 rbd_assert(first->parent_spec);
Alex Elder05a46af2013-04-26 15:44:36 -05005767 rbd_spec_put(first->parent_spec);
5768 first->parent_spec = NULL;
Alex Elder05a46af2013-04-26 15:44:36 -05005769 }
5770}
5771
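/*
 * Sysfs "remove" usage (the device id is illustrative):
 *
 *   $ echo 0 > /sys/bus/rbd/remove          # -EBUSY if still open
 *   $ echo "0 force" > /sys/bus/rbd/remove  # fence an open device
 *
 * "force" freezes the queue and marks it dying so outstanding and
 * future I/O fails, as handled below.
 */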
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005772static ssize_t do_rbd_remove(struct bus_type *bus,
5773 const char *buf,
5774 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005775{
5776 struct rbd_device *rbd_dev = NULL;
Alex Elder751cc0e2013-05-31 15:17:01 -05005777 struct list_head *tmp;
5778 int dev_id;
Mike Christie0276dca2016-08-18 18:38:45 +02005779 char opt_buf[6];
Alex Elder82a442d2013-05-31 17:40:44 -05005780 bool already = false;
Mike Christie0276dca2016-08-18 18:38:45 +02005781 bool force = false;
Alex Elder0d8189e2013-04-27 09:59:30 -05005782 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005783
Mike Christie0276dca2016-08-18 18:38:45 +02005784 dev_id = -1;
5785 opt_buf[0] = '\0';
5786 sscanf(buf, "%d %5s", &dev_id, opt_buf);
5787 if (dev_id < 0) {
5788 pr_err("dev_id out of range\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005789 return -EINVAL;
Mike Christie0276dca2016-08-18 18:38:45 +02005790 }
5791 if (opt_buf[0] != '\0') {
5792 if (!strcmp(opt_buf, "force")) {
5793 force = true;
5794 } else {
5795 pr_err("bad remove option at '%s'\n", opt_buf);
5796 return -EINVAL;
5797 }
5798 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005799
Alex Elder751cc0e2013-05-31 15:17:01 -05005800 ret = -ENOENT;
5801 spin_lock(&rbd_dev_list_lock);
5802 list_for_each(tmp, &rbd_dev_list) {
5803 rbd_dev = list_entry(tmp, struct rbd_device, node);
5804 if (rbd_dev->dev_id == dev_id) {
5805 ret = 0;
5806 break;
5807 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005808 }
Alex Elder751cc0e2013-05-31 15:17:01 -05005809 if (!ret) {
5810 spin_lock_irq(&rbd_dev->lock);
Mike Christie0276dca2016-08-18 18:38:45 +02005811 if (rbd_dev->open_count && !force)
Alex Elder751cc0e2013-05-31 15:17:01 -05005812 ret = -EBUSY;
5813 else
Alex Elder82a442d2013-05-31 17:40:44 -05005814 already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
5815 &rbd_dev->flags);
Alex Elder751cc0e2013-05-31 15:17:01 -05005816 spin_unlock_irq(&rbd_dev->lock);
5817 }
5818 spin_unlock(&rbd_dev_list_lock);
Alex Elder82a442d2013-05-31 17:40:44 -05005819 if (ret < 0 || already)
Alex Elder1ba0f1e2013-05-31 15:17:01 -05005820 return ret;
Alex Elder751cc0e2013-05-31 15:17:01 -05005821
Mike Christie0276dca2016-08-18 18:38:45 +02005822 if (force) {
5823 /*
5824 * Prevent new IO from being queued and wait for existing
5825 * IO to complete/fail.
5826 */
5827 blk_mq_freeze_queue(rbd_dev->disk->queue);
5828 blk_set_queue_dying(rbd_dev->disk->queue);
5829 }
5830
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005831 del_gendisk(rbd_dev->disk);
5832 spin_lock(&rbd_dev_list_lock);
5833 list_del_init(&rbd_dev->node);
5834 spin_unlock(&rbd_dev_list_lock);
5835 device_del(&rbd_dev->dev);
Ilya Dryomovfca27062013-12-16 18:02:40 +02005836
Ilya Dryomove010dd02017-04-13 12:17:39 +02005837 rbd_dev_image_unlock(rbd_dev);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02005838 rbd_dev_device_release(rbd_dev);
Alex Elder8ad42cd2013-04-28 23:32:34 -05005839 rbd_dev_image_release(rbd_dev);
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02005840 rbd_dev_destroy(rbd_dev);
Alex Elder1ba0f1e2013-05-31 15:17:01 -05005841 return count;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005842}
5843
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005844static ssize_t rbd_remove(struct bus_type *bus,
5845 const char *buf,
5846 size_t count)
5847{
5848 if (single_major)
5849 return -EINVAL;
5850
5851 return do_rbd_remove(bus, buf, count);
5852}
5853
5854static ssize_t rbd_remove_single_major(struct bus_type *bus,
5855 const char *buf,
5856 size_t count)
5857{
5858 return do_rbd_remove(bus, buf, count);
5859}
5860
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005861/*
5862 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005863 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005864 */
5865static int rbd_sysfs_init(void)
5866{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005867 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005868
Alex Elderfed4c142012-02-07 12:03:36 -06005869 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06005870 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005871 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005872
Alex Elderfed4c142012-02-07 12:03:36 -06005873 ret = bus_register(&rbd_bus_type);
5874 if (ret < 0)
5875 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005876
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005877 return ret;
5878}
5879
5880static void rbd_sysfs_cleanup(void)
5881{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005882 bus_unregister(&rbd_bus_type);
Alex Elderfed4c142012-02-07 12:03:36 -06005883 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005884}
5885
Alex Elder1c2a9df2013-05-01 12:43:03 -05005886static int rbd_slab_init(void)
5887{
5888 rbd_assert(!rbd_img_request_cache);
Geliang Tang03d94402016-03-13 15:17:32 +08005889 rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
Alex Elder868311b2013-05-01 12:43:03 -05005890 if (!rbd_img_request_cache)
5891 return -ENOMEM;
5892
5893 rbd_assert(!rbd_obj_request_cache);
Geliang Tang03d94402016-03-13 15:17:32 +08005894 rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
Alex Elder78c2a442013-05-01 12:43:04 -05005895 if (!rbd_obj_request_cache)
5896 goto out_err;
5897
Ilya Dryomov6c696d82017-01-25 18:16:23 +01005898 return 0;
Alex Elder1c2a9df2013-05-01 12:43:03 -05005899
Ilya Dryomov6c696d82017-01-25 18:16:23 +01005900out_err:
Alex Elder868311b2013-05-01 12:43:03 -05005901 kmem_cache_destroy(rbd_img_request_cache);
5902 rbd_img_request_cache = NULL;
Alex Elder1c2a9df2013-05-01 12:43:03 -05005903 return -ENOMEM;
5904}
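
/*
 * KMEM_CACHE() names each cache after the struct and sizes it
 * automatically, so image and object requests are allocated from
 * dedicated slabs rather than the generic kmalloc pools.
 */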
5905
5906static void rbd_slab_exit(void)
5907{
Alex Elder868311b2013-05-01 12:43:03 -05005908 rbd_assert(rbd_obj_request_cache);
5909 kmem_cache_destroy(rbd_obj_request_cache);
5910 rbd_obj_request_cache = NULL;
5911
Alex Elder1c2a9df2013-05-01 12:43:03 -05005912 rbd_assert(rbd_img_request_cache);
5913 kmem_cache_destroy(rbd_img_request_cache);
5914 rbd_img_request_cache = NULL;
5915}
5916
Alex Eldercc344fa2013-02-19 12:25:56 -06005917static int __init rbd_init(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005918{
5919 int rc;
5920
Alex Elder1e32d342013-01-30 11:13:33 -06005921 if (!libceph_compatible(NULL)) {
5922 rbd_warn(NULL, "libceph incompatibility (quitting)");
Alex Elder1e32d342013-01-30 11:13:33 -06005923 return -EINVAL;
5924 }
Ilya Dryomove1b4d962013-12-13 15:28:57 +02005925
Alex Elder1c2a9df2013-05-01 12:43:03 -05005926 rc = rbd_slab_init();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005927 if (rc)
5928 return rc;
Ilya Dryomove1b4d962013-12-13 15:28:57 +02005929
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04005930 /*
5931 * The number of active work items is limited by the number of
Ilya Dryomovf77303b2015-04-22 18:28:13 +03005932 * rbd devices * queue depth, so leave @max_active at default.
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04005933 */
5934 rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
5935 if (!rbd_wq) {
5936 rc = -ENOMEM;
5937 goto err_out_slab;
5938 }
5939
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005940 if (single_major) {
5941 rbd_major = register_blkdev(0, RBD_DRV_NAME);
5942 if (rbd_major < 0) {
5943 rc = rbd_major;
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04005944 goto err_out_wq;
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005945 }
5946 }
5947
Alex Elder1c2a9df2013-05-01 12:43:03 -05005948 rc = rbd_sysfs_init();
5949 if (rc)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005950 goto err_out_blkdev;
Alex Elder1c2a9df2013-05-01 12:43:03 -05005951
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005952 if (single_major)
5953 pr_info("loaded (major %d)\n", rbd_major);
5954 else
5955 pr_info("loaded\n");
5956
Ilya Dryomove1b4d962013-12-13 15:28:57 +02005957 return 0;
5958
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005959err_out_blkdev:
5960 if (single_major)
5961 unregister_blkdev(rbd_major, RBD_DRV_NAME);
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04005962err_out_wq:
5963 destroy_workqueue(rbd_wq);
Ilya Dryomove1b4d962013-12-13 15:28:57 +02005964err_out_slab:
5965 rbd_slab_exit();
Alex Elder1c2a9df2013-05-01 12:43:03 -05005966 return rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005967}
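
/*
 * Loading example (assuming the usual module parameter wiring for
 * single_major earlier in this file):
 *
 *   $ modprobe rbd single_major=Y
 *
 * registers one shared major up front instead of one major per
 * mapped device.
 */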
5968
Alex Eldercc344fa2013-02-19 12:25:56 -06005969static void __exit rbd_exit(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005970{
Ilya Dryomovffe312c2014-05-20 15:46:04 +04005971 ida_destroy(&rbd_dev_id_ida);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005972 rbd_sysfs_cleanup();
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005973 if (single_major)
5974 unregister_blkdev(rbd_major, RBD_DRV_NAME);
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04005975 destroy_workqueue(rbd_wq);
Alex Elder1c2a9df2013-05-01 12:43:03 -05005976 rbd_slab_exit();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005977}
5978
5979module_init(rbd_init);
5980module_exit(rbd_exit);
5981
Alex Elderd552c612013-05-31 20:13:09 -05005982MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005983MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
5984MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005985/* following authorship retained from original osdblk.c */
5986MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
5987
Ilya Dryomov90da2582013-12-13 15:28:56 +02005988MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005989MODULE_LICENSE("GPL");