
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		 Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/workqueue.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/*
 * Increment the given counter and return its updated value.
 * If the counter is already 0 it will not be incremented.
 * If the counter is already at its maximum value, -EINVAL is
 * returned without updating it.
 */
static int atomic_inc_return_safe(atomic_t *v)
{
	unsigned int counter;

	counter = (unsigned int)__atomic_add_unless(v, 1, 0);
	if (counter <= (unsigned int)INT_MAX)
		return (int)counter;

	atomic_dec(v);

	return -EINVAL;
}

/* Decrement the counter.  Return the resulting value, or -EINVAL */
static int atomic_dec_return_safe(atomic_t *v)
{
	int counter;

	counter = atomic_dec_return(v);
	if (counter >= 0)
		return counter;

	atomic_inc(v);

	return -EINVAL;
}
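
/*
 * Illustrative usage sketch (comment only, not driver code): these
 * helpers suit reference counts that must stay pinned at zero once
 * they reach it, e.g. the parent image ref declared further down.
 * A caller would pair them roughly like this:
 *
 *	if (atomic_inc_return_safe(&rbd_dev->parent_ref) < 0)
 *		return;			(parent already going away)
 *	...use the parent...
 *	atomic_dec_return_safe(&rbd_dev->parent_ref);
 *
 * __atomic_add_unless(v, 1, 0) returns the old value and refuses to
 * add when the counter is 0, which keeps a torn-down parent from
 * being resurrected.
 */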

#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING		(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
#define RBD_FEATURE_DATA_POOL		(1ULL<<7)
#define RBD_FEATURE_OPERATIONS		(1ULL<<8)

#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These six fields never change for a given rbd image */
	char *object_prefix;
	__u8 obj_order;
	u64 stripe_unit;
	u64 stripe_count;
	s64 data_pool_id;
	u64 features;		/* Might be changeable someday? */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* format 1 only */
	u64 *snap_sizes;	/* format 1 only */
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the ids in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the ids associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	const char	*pool_name;

	const char	*image_id;
	const char	*image_name;

	u64		snap_id;
	const char	*snap_name;

	struct kref	kref;
};

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};

struct rbd_img_request;

enum obj_request_type {
	OBJ_REQUEST_NODATA = 1,
	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
};

enum obj_operation_type {
	OBJ_OP_READ = 1,
	OBJ_OP_WRITE,
	OBJ_OP_DISCARD,
};

/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 *                       need copyup
 * RBD_OBJ_WRITE_GUARD ---------------> RBD_OBJ_WRITE_COPYUP
 *        |     ^                              |
 *        v     \------------------------------/
 *      done
 *        ^
 *        |
 * RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * there is a parent or not.
 */
enum rbd_obj_write_state {
	RBD_OBJ_WRITE_FLAT = 1,
	RBD_OBJ_WRITE_GUARD,
	RBD_OBJ_WRITE_COPYUP,
};
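
/*
 * Worked example (annotation only, assuming a layered image): a write
 * that lands on an object which may still exist only in the parent
 * starts in RBD_OBJ_WRITE_GUARD.  If the guarded OSD write fails with
 * -ENOENT, the object has not been written in the child yet, so the
 * parent data is read up and the request moves to
 * RBD_OBJ_WRITE_COPYUP, which writes the parent data plus the new
 * bytes and then completes.  Without a parent, the same write starts
 * and finishes in RBD_OBJ_WRITE_FLAT.
 */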

struct rbd_obj_request {
	u64			object_no;
	u64			offset;		/* object start byte */
	u64			length;		/* bytes from offset */
	union {
		bool		tried_parent;	/* for reads */
		enum rbd_obj_write_state write_state;	/* for writes */
	};

	struct rbd_img_request	*img_request;
	u64			img_offset;
	/* links for img_request->obj_requests list */
	struct list_head	links;

	union {
		struct ceph_bio_iter	bio_pos;
		struct {
			struct ceph_bvec_iter	bvec_pos;
			u32			bvec_count;
		};
	};
	struct bio_vec		*copyup_bvecs;
	u32			copyup_bvec_count;

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	int			result;

	struct kref		kref;
};

enum img_req_flags {
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};

struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	enum obj_operation_type	op_type;
	enum obj_request_type	data_type;
	u64			offset;	/* starting image byte offset */
	u64			length;	/* byte count from offset */
	unsigned long		flags;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};
	spinlock_t		completion_lock;
	u64			xferred;/* aggregate bytes transferred */
	int			result;	/* first nonzero obj_request result */

	u32			obj_request_count;
	u32			pending_count;
	struct list_head	obj_requests;	/* rbd_obj_request structs */

	struct kref		kref;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)

enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};

enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};

/* WatchNotify::ClientId */
struct rbd_client_id {
	u64 gid;
	u64 handle;
};

struct rbd_mapping {
	u64	size;
	u64	features;
};

/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	int			minor;
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
	char			*config_info;	/* add{,_single_major} string */

	struct ceph_object_id	header_oid;
	struct ceph_object_locator header_oloc;

	struct ceph_file_layout	layout;		/* used for all rbd requests */

	struct mutex		watch_mutex;
	enum rbd_watch_state	watch_state;
	struct ceph_osd_linger_request *watch_handle;
	u64			watch_cookie;
	struct delayed_work	watch_dwork;

	struct rw_semaphore	lock_rwsem;
	enum rbd_lock_state	lock_state;
	char			lock_cookie[32];
	struct rbd_client_id	owner_cid;
	struct work_struct	acquired_lock_work;
	struct work_struct	released_lock_work;
	struct delayed_work	lock_dwork;
	struct work_struct	unlock_work;
	wait_queue_head_t	lock_waitq;

	struct workqueue_struct	*task_wq;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	atomic_t		parent_ref;
	struct rbd_device	*parent;

	/* Block layer tags. */
	struct blk_mq_tag_set	tag_set;

	/* protects updating the header */
	struct rw_semaphore	header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags:
 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 *   by rbd_dev->lock
 * - BLACKLISTED is protected by rbd_dev->lock_rwsem
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
	RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
};

static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */

static LIST_HEAD(rbd_dev_list);		/* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);	/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

static struct workqueue_struct *rbd_wq;

/*
 * single-major requires >= 0.75 version of userspace rbd utility.
 */
static bool single_major = true;
module_param(single_major, bool, S_IRUGO);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
				    size_t count);
static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
				       size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
static void rbd_spec_put(struct rbd_spec *spec);

static int rbd_dev_id_to_minor(int dev_id)
{
	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
}

static int minor_to_rbd_dev_id(int minor)
{
	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}
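
/*
 * Example of the single-major minor layout (comment only): with
 * RBD_SINGLE_MAJOR_PART_SHIFT == 4, each device owns 16 minors, so
 * rbd_dev_id_to_minor(3) == 48, partitions of rbd3 occupy minors
 * 49-63, and minor_to_rbd_dev_id() maps any of 48-63 back to 3.
 */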

static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
}

static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	bool is_lock_owner;

	down_read(&rbd_dev->lock_rwsem);
	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
	return is_lock_owner;
}

static ssize_t rbd_supported_features_show(struct bus_type *bus, char *buf)
{
	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}

static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
static BUS_ATTR(supported_features, S_IRUGO, rbd_supported_features_show, NULL);

static struct attribute *rbd_bus_attrs[] = {
	&bus_attr_add.attr,
	&bus_attr_remove.attr,
	&bus_attr_add_single_major.attr,
	&bus_attr_remove_single_major.attr,
	&bus_attr_supported_features.attr,
	NULL,
};

static umode_t rbd_bus_is_visible(struct kobject *kobj,
				  struct attribute *attr, int index)
{
	if (!single_major &&
	    (attr == &bus_attr_add_single_major.attr ||
	     attr == &bus_attr_remove_single_major.attr))
		return 0;

	return attr->mode;
}

static const struct attribute_group rbd_bus_group = {
	.attrs = rbd_bus_attrs,
	.is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_groups	= rbd_bus_groups,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}

#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_header_info(struct rbd_device *rbd_dev);
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
		u8 *order, u64 *snap_size);
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features);

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	(void) get_device(&rbd_dev->dev);

	return 0;
}

static void rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	put_device(&rbd_dev->dev);
}

static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
{
	int ro;

	if (get_user(ro, (int __user *)arg))
		return -EFAULT;

	/* Snapshots can't be marked read-write */
	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
		return -EROFS;

	/* Let blkdev_roset() handle it */
	return -ENOTTY;
}

static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	int ret;

	switch (cmd) {
	case BLKROSET:
		ret = rbd_ioctl_set_ro(rbd_dev, arg);
		break;
	default:
		ret = -ENOTTY;
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	return rbd_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
	.ioctl			= rbd_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl		= rbd_compat_ioctl,
#endif
};

/*
 * Initialize an rbd client instance.  Success or not, this function
 * consumes ceph_opts.  Caller holds client_mutex.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	rbdc->client = ceph_create_client(ceph_opts, rbdc);
	if (IS_ERR(rbdc->client))
		goto out_rbdc;
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_client;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;
out_client:
	ceph_destroy_client(rbdc->client);
out_rbdc:
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}

/*
 * (Per device) rbd map options
 */
enum {
	Opt_queue_depth,
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_err
};

static match_table_t rbd_opts_tokens = {
	{Opt_queue_depth, "queue_depth=%d"},
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	{Opt_lock_on_read, "lock_on_read"},
	{Opt_exclusive, "exclusive"},
	{Opt_err, NULL}
};

struct rbd_options {
	int	queue_depth;
	bool	read_only;
	bool	lock_on_read;
	bool	exclusive;
};

#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
#define RBD_READ_ONLY_DEFAULT	false
#define RBD_LOCK_ON_READ_DEFAULT false
#define RBD_EXCLUSIVE_DEFAULT	false

static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token, argstr[0].from);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_queue_depth:
		if (intval < 1) {
			pr_err("queue_depth out of range\n");
			return -EINVAL;
		}
		rbd_opts->queue_depth = intval;
		break;
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	case Opt_lock_on_read:
		rbd_opts->lock_on_read = true;
		break;
	case Opt_exclusive:
		rbd_opts->exclusive = true;
		break;
	default:
		/* libceph prints "bad option" msg */
		return -EINVAL;
	}

	return 0;
}
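
/*
 * Usage sketch (assumption, comment only): rbd-specific tokens travel
 * in the same option blob as the libceph options; anything libceph
 * does not recognize is handed to parse_rbd_opts_token() one token at
 * a time.  Mapping with e.g. "name=admin,queue_depth=128,lock_on_read"
 * would leave rbd_opts->queue_depth == 128 and lock_on_read == true.
 */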

static char* obj_op_name(enum obj_operation_type op_type)
{
	switch (op_type) {
	case OBJ_OP_READ:
		return "read";
	case OBJ_OP_WRITE:
		return "write";
	case OBJ_OP_DISCARD:
		return "discard";
	default:
		return "???";
	}
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, ceph_opts is consumed by this
 * function.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;

	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
	rbdc = rbd_client_find(ceph_opts);
	if (rbdc)	/* using an existing client */
		ceph_destroy_options(ceph_opts);
	else
		rbdc = rbd_client_create(ceph_opts);
	mutex_unlock(&client_mutex);

	return rbdc;
}

/*
 * Destroy ceph client
 *
 * Caller must hold rbd_client_list_lock.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node.  If it's not referenced anymore,
 * release it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}

/*
 * returns the size of an object in the image
 */
static u32 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1U << header->obj_order;
}

static void rbd_init_layout(struct rbd_device *rbd_dev)
{
	if (rbd_dev->header.stripe_unit == 0 ||
	    rbd_dev->header.stripe_count == 0) {
		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
		rbd_dev->header.stripe_count = 1;
	}

	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
			rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
}

/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
				struct rbd_image_header_ondisk *ondisk)
{
	struct rbd_image_header *header = &rbd_dev->header;
	bool first_time = header->object_prefix == NULL;
	struct ceph_snap_context *snapc;
	char *object_prefix = NULL;
	char *snap_names = NULL;
	u64 *snap_sizes = NULL;
	u32 snap_count;
	int ret = -ENOMEM;
	u32 i;

	/* Allocate this now to avoid having to handle failure below */

	if (first_time) {
		object_prefix = kstrndup(ondisk->object_prefix,
					 sizeof(ondisk->object_prefix),
					 GFP_KERNEL);
		if (!object_prefix)
			return -ENOMEM;
	}

	/* Allocate the snapshot context and fill it in */

	snap_count = le32_to_cpu(ondisk->snap_count);
	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc)
		goto out_err;
	snapc->seq = le64_to_cpu(ondisk->snap_seq);
	if (snap_count) {
		struct rbd_image_snap_ondisk *snaps;
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* We'll keep a copy of the snapshot names... */

		if (snap_names_len > (u64)SIZE_MAX)
			goto out_2big;
		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!snap_names)
			goto out_err;

		/* ...as well as the array of their sizes. */
		snap_sizes = kmalloc_array(snap_count,
					   sizeof(*header->snap_sizes),
					   GFP_KERNEL);
		if (!snap_sizes)
			goto out_err;

		/*
		 * Copy the names, and fill in each snapshot's id
		 * and size.
		 *
		 * Note that rbd_dev_v1_header_info() guarantees the
		 * ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, so this memcpy() is safe.
		 */
		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
		snaps = ondisk->snaps;
		for (i = 0; i < snap_count; i++) {
			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
		}
	}

	/* We won't fail any more, fill in the header */

	if (first_time) {
		header->object_prefix = object_prefix;
		header->obj_order = ondisk->options.order;
		rbd_init_layout(rbd_dev);
	} else {
		ceph_put_snap_context(header->snapc);
		kfree(header->snap_names);
		kfree(header->snap_sizes);
	}

	/* The remaining fields always get updated (when we refresh) */

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->snapc = snapc;
	header->snap_names = snap_names;
	header->snap_sizes = snap_sizes;

	return 0;
out_2big:
	ret = -EIO;
out_err:
	kfree(snap_sizes);
	kfree(snap_names);
	ceph_put_snap_context(snapc);
	kfree(object_prefix);

	return ret;
}

static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	const char *snap_name;

	rbd_assert(which < rbd_dev->header.snapc->num_snaps);

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which--)
		snap_name += strlen(snap_name) + 1;

	return kstrdup(snap_name, GFP_KERNEL);
}

/*
 * Snapshot id comparison function for use with qsort()/bsearch().
 * Note that result is for snapshots in *descending* order.
 */
static int snapid_compare_reverse(const void *s1, const void *s2)
{
	u64 snap_id1 = *(u64 *)s1;
	u64 snap_id2 = *(u64 *)s2;

	if (snap_id1 < snap_id2)
		return 1;
	return snap_id1 == snap_id2 ? 0 : -1;
}

/*
 * Search a snapshot context to see if the given snapshot id is
 * present.
 *
 * Returns the position of the snapshot id in the array if it's found,
 * or BAD_SNAP_INDEX otherwise.
 *
 * Note: The snapshot array is kept sorted (by the osd) in
 * reverse order, highest snapshot id first.
 */
static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	u64 *found;

	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
			sizeof (snap_id), snapid_compare_reverse);

	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
}
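
/*
 * Example (comment only): with snapc->snaps == { 40, 25, 10 } (the
 * OSDs keep snapshot ids in descending order), looking up id 25 via
 * bsearch() with the reversed comparator returns index 1, while
 * looking up id 30 returns BAD_SNAP_INDEX.
 */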

static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id)
{
	u32 which;
	const char *snap_name;

	which = rbd_dev_snap_index(rbd_dev, snap_id);
	if (which == BAD_SNAP_INDEX)
		return ERR_PTR(-ENOENT);

	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
}

static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
	if (snap_id == CEPH_NOSNAP)
		return RBD_SNAP_HEAD_NAME;

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (rbd_dev->image_format == 1)
		return rbd_dev_v1_snap_name(rbd_dev, snap_id);

	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
}

static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_size)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_size = rbd_dev->header.image_size;
	} else if (rbd_dev->image_format == 1) {
		u32 which;

		which = rbd_dev_snap_index(rbd_dev, snap_id);
		if (which == BAD_SNAP_INDEX)
			return -ENOENT;

		*snap_size = rbd_dev->header.snap_sizes[which];
	} else {
		u64 size = 0;
		int ret;

		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
		if (ret)
			return ret;

		*snap_size = size;
	}
	return 0;
}

static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
			u64 *snap_features)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_features = rbd_dev->header.features;
	} else if (rbd_dev->image_format == 1) {
		*snap_features = 0;	/* No features for format 1 */
	} else {
		u64 features = 0;
		int ret;

		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
		if (ret)
			return ret;

		*snap_features = features;
	}
	return 0;
}

static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
{
	u64 snap_id = rbd_dev->spec->snap_id;
	u64 size = 0;
	u64 features = 0;
	int ret;

	ret = rbd_snap_size(rbd_dev, snap_id, &size);
	if (ret)
		return ret;
	ret = rbd_snap_features(rbd_dev, snap_id, &features);
	if (ret)
		return ret;

	rbd_dev->mapping.size = size;
	rbd_dev->mapping.features = features;

	return 0;
}

static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
{
	rbd_dev->mapping.size = 0;
	rbd_dev->mapping.features = 0;
}

static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
{
	u64 segment_size = rbd_obj_bytes(&rbd_dev->header);

	return offset & (segment_size - 1);
}

static u64 rbd_segment_length(struct rbd_device *rbd_dev,
				u64 offset, u64 length)
{
	u64 segment_size = rbd_obj_bytes(&rbd_dev->header);

	offset &= segment_size - 1;

	rbd_assert(length <= U64_MAX - offset);
	if (offset + length > segment_size)
		length = segment_size - offset;

	return length;
}
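
/*
 * Worked example (comment only): with the default 4 MiB objects
 * (obj_order 22, segment_size 0x400000), an 8 KiB request at image
 * offset 0x3ff000 starts 4 KiB short of an object boundary, so
 * rbd_segment_offset() returns 0x3ff000 and rbd_segment_length()
 * clips the first piece to 0x1000 bytes; the remaining 4 KiB fall
 * into the next object.
 */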
1232
Ilya Dryomov5359a172018-01-20 10:30:10 +01001233static void zero_bvec(struct bio_vec *bv)
1234{
1235 void *buf;
1236 unsigned long flags;
1237
1238 buf = bvec_kmap_irq(bv, &flags);
1239 memset(buf, 0, bv->bv_len);
1240 flush_dcache_page(bv->bv_page);
1241 bvec_kunmap_irq(buf, &flags);
1242}
1243
1244static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
1245{
1246 struct ceph_bio_iter it = *bio_pos;
1247
1248 ceph_bio_iter_advance(&it, off);
1249 ceph_bio_iter_advance_step(&it, bytes, ({
1250 zero_bvec(&bv);
1251 }));
1252}
1253
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001254static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
Alex Elderb9434c52013-04-19 15:34:50 -05001255{
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001256 struct ceph_bvec_iter it = *bvec_pos;
Alex Elderb9434c52013-04-19 15:34:50 -05001257
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001258 ceph_bvec_iter_advance(&it, off);
1259 ceph_bvec_iter_advance_step(&it, bytes, ({
1260 zero_bvec(&bv);
1261 }));
Alex Elderb9434c52013-04-19 15:34:50 -05001262}
1263
1264/*
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001265 * Zero a range in @obj_req data buffer defined by a bio (list) or
1266 * bio_vec array.
1267 *
1268 * @off is relative to the start of the data buffer.
1269 */
1270static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
1271 u32 bytes)
1272{
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001273 switch (obj_req->img_request->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001274 case OBJ_REQUEST_BIO:
1275 zero_bios(&obj_req->bio_pos, off, bytes);
1276 break;
1277 case OBJ_REQUEST_BVECS:
1278 zero_bvecs(&obj_req->bvec_pos, off, bytes);
1279 break;
1280 default:
1281 rbd_assert(0);
1282 }
1283}
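/*
 * Usage sketch, taken from the read completion path further down: a
 * read that returns -ENOENT or comes up short is padded with zeroes
 * relative to the start of the object request's data buffer:
 *
 *	rbd_obj_zero_range(obj_req, obj_req->xferred,
 *			   obj_req->length - obj_req->xferred);
 *
 * See rbd_obj_handle_read().
 */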
1284
Ilya Dryomov96385562014-06-10 13:53:29 +04001285static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
1286{
1287 struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
1288
1289 return obj_request->img_offset <
1290 round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header));
1291}
1292
Alex Elderbf0d5f502012-11-22 00:00:08 -06001293static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1294{
Alex Elder37206ee2013-02-20 17:32:08 -06001295 dout("%s: obj %p (was %d)\n", __func__, obj_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001296 kref_read(&obj_request->kref));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001297 kref_get(&obj_request->kref);
1298}
1299
1300static void rbd_obj_request_destroy(struct kref *kref);
1301static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1302{
1303 rbd_assert(obj_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001304 dout("%s: obj %p (was %d)\n", __func__, obj_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001305 kref_read(&obj_request->kref));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001306 kref_put(&obj_request->kref, rbd_obj_request_destroy);
1307}
1308
Alex Elder0f2d5be2014-04-26 14:21:44 +04001309static void rbd_img_request_get(struct rbd_img_request *img_request)
1310{
1311 dout("%s: img %p (was %d)\n", __func__, img_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001312 kref_read(&img_request->kref));
Alex Elder0f2d5be2014-04-26 14:21:44 +04001313 kref_get(&img_request->kref);
1314}
1315
Alex Eldere93f3152013-05-08 22:50:04 -05001316static bool img_request_child_test(struct rbd_img_request *img_request);
1317static void rbd_parent_request_destroy(struct kref *kref);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001318static void rbd_img_request_destroy(struct kref *kref);
1319static void rbd_img_request_put(struct rbd_img_request *img_request)
1320{
1321 rbd_assert(img_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001322 dout("%s: img %p (was %d)\n", __func__, img_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001323 kref_read(&img_request->kref));
Alex Eldere93f3152013-05-08 22:50:04 -05001324 if (img_request_child_test(img_request))
1325 kref_put(&img_request->kref, rbd_parent_request_destroy);
1326 else
1327 kref_put(&img_request->kref, rbd_img_request_destroy);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001328}
1329
1330static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1331 struct rbd_obj_request *obj_request)
1332{
Alex Elder25dcf952013-01-25 17:08:55 -06001333 rbd_assert(obj_request->img_request == NULL);
1334
Alex Elderb155e862013-04-15 14:50:37 -05001335 /* Image request now owns object's original reference */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001336 obj_request->img_request = img_request;
Alex Elder25dcf952013-01-25 17:08:55 -06001337 img_request->obj_request_count++;
Ilya Dryomov7114eda2018-02-01 11:50:47 +01001338 img_request->pending_count++;
Alex Elder25dcf952013-01-25 17:08:55 -06001339 list_add_tail(&obj_request->links, &img_request->obj_requests);
Ilya Dryomov15961b42018-02-01 11:50:47 +01001340 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001341}
1342
1343static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1344 struct rbd_obj_request *obj_request)
1345{
Ilya Dryomov15961b42018-02-01 11:50:47 +01001346 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001347 list_del(&obj_request->links);
Alex Elder25dcf952013-01-25 17:08:55 -06001348 rbd_assert(img_request->obj_request_count > 0);
1349 img_request->obj_request_count--;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001350 rbd_assert(obj_request->img_request == img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001351 rbd_obj_request_put(obj_request);
1352}
1353
Ilya Dryomov980917f2016-09-12 18:59:42 +02001354static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001355{
Ilya Dryomov980917f2016-09-12 18:59:42 +02001356 struct ceph_osd_request *osd_req = obj_request->osd_req;
1357
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001358 dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
1359 obj_request, obj_request->object_no, obj_request->offset,
Ilya Dryomov67e2b652017-01-25 18:16:22 +01001360 obj_request->length, osd_req);
Ilya Dryomov980917f2016-09-12 18:59:42 +02001361 ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001362}
1363
Alex Elder0c425242013-02-08 09:55:49 -06001364/*
1365 * The default/initial value for all image request flags is 0. Each
1366 * is conditionally set to 1 at image request initialization time
 1367 * and currently never changes thereafter.
1368 */
Alex Elder9849e982013-01-24 16:13:36 -06001369static void img_request_child_set(struct rbd_img_request *img_request)
1370{
1371 set_bit(IMG_REQ_CHILD, &img_request->flags);
1372 smp_mb();
1373}
1374
Alex Eldere93f3152013-05-08 22:50:04 -05001375static void img_request_child_clear(struct rbd_img_request *img_request)
1376{
1377 clear_bit(IMG_REQ_CHILD, &img_request->flags);
1378 smp_mb();
1379}
1380
Alex Elder9849e982013-01-24 16:13:36 -06001381static bool img_request_child_test(struct rbd_img_request *img_request)
1382{
1383 smp_mb();
1384 return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
1385}
1386
Alex Elderd0b2e942013-01-24 16:13:36 -06001387static void img_request_layered_set(struct rbd_img_request *img_request)
1388{
1389 set_bit(IMG_REQ_LAYERED, &img_request->flags);
1390 smp_mb();
1391}
1392
Alex Eldera2acd002013-05-08 22:50:04 -05001393static void img_request_layered_clear(struct rbd_img_request *img_request)
1394{
1395 clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1396 smp_mb();
1397}
1398
Alex Elderd0b2e942013-01-24 16:13:36 -06001399static bool img_request_layered_test(struct rbd_img_request *img_request)
1400{
1401 smp_mb();
1402 return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1403}
1404
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001405static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
1406{
1407 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1408
1409 return !obj_req->offset &&
1410 obj_req->length == rbd_dev->layout.object_size;
1411}
1412
1413static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
1414{
1415 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1416
1417 return obj_req->offset + obj_req->length ==
1418 rbd_dev->layout.object_size;
1419}
1420
1421static bool rbd_img_is_write(struct rbd_img_request *img_req)
1422{
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001423 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001424 case OBJ_OP_READ:
1425 return false;
1426 case OBJ_OP_WRITE:
1427 case OBJ_OP_DISCARD:
1428 return true;
1429 default:
1430 rbd_assert(0);
1431 }
1432}
1433
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001434static void rbd_obj_handle_request(struct rbd_obj_request *obj_req);
1435
Ilya Dryomov85e084f2016-04-28 16:07:24 +02001436static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001437{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001438 struct rbd_obj_request *obj_req = osd_req->r_priv;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001439
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001440 dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
1441 osd_req->r_result, obj_req);
1442 rbd_assert(osd_req == obj_req->osd_req);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001443
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001444 obj_req->result = osd_req->r_result < 0 ? osd_req->r_result : 0;
1445 if (!obj_req->result && !rbd_img_is_write(obj_req->img_request))
1446 obj_req->xferred = osd_req->r_result;
1447 else
1448 /*
1449 * Writes aren't allowed to return a data payload. In some
1450 * guarded write cases (e.g. stat + zero on an empty object)
1451 * a stat response makes it through, but we don't care.
1452 */
1453 obj_req->xferred = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001454
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001455 rbd_obj_handle_request(obj_req);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001456}
1457
Alex Elder9d4df012013-04-19 15:34:50 -05001458static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
Alex Elder430c28c2013-04-03 21:32:51 -05001459{
Alex Elder8c042b02013-04-03 01:28:58 -05001460 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder430c28c2013-04-03 21:32:51 -05001461
Ilya Dryomova162b302018-01-30 17:52:10 +01001462 osd_req->r_flags = CEPH_OSD_FLAG_READ;
Ilya Dryomov7c848832016-09-15 17:56:39 +02001463 osd_req->r_snapid = obj_request->img_request->snap_id;
Alex Elder9d4df012013-04-19 15:34:50 -05001464}
1465
1466static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
1467{
Alex Elder9d4df012013-04-19 15:34:50 -05001468 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder9d4df012013-04-19 15:34:50 -05001469
Ilya Dryomova162b302018-01-30 17:52:10 +01001470 osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
Deepa Dinamani1134e092017-05-08 15:59:19 -07001471 ktime_get_real_ts(&osd_req->r_mtime);
Ilya Dryomovbb873b52016-05-26 00:29:52 +02001472 osd_req->r_data_offset = obj_request->offset;
Alex Elder430c28c2013-04-03 21:32:51 -05001473}
1474
Ilya Dryomovbc812072017-01-25 18:16:23 +01001475static struct ceph_osd_request *
Ilya Dryomova162b302018-01-30 17:52:10 +01001476rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops)
Ilya Dryomovbc812072017-01-25 18:16:23 +01001477{
Ilya Dryomova162b302018-01-30 17:52:10 +01001478 struct rbd_img_request *img_req = obj_req->img_request;
1479 struct rbd_device *rbd_dev = img_req->rbd_dev;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001480 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1481 struct ceph_osd_request *req;
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001482 const char *name_format = rbd_dev->image_format == 1 ?
1483 RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001484
Ilya Dryomova162b302018-01-30 17:52:10 +01001485 req = ceph_osdc_alloc_request(osdc,
1486 (rbd_img_is_write(img_req) ? img_req->snapc : NULL),
1487 num_ops, false, GFP_NOIO);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001488 if (!req)
1489 return NULL;
1490
Ilya Dryomovbc812072017-01-25 18:16:23 +01001491 req->r_callback = rbd_osd_req_callback;
Ilya Dryomova162b302018-01-30 17:52:10 +01001492 req->r_priv = obj_req;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001493
1494 req->r_base_oloc.pool = rbd_dev->layout.pool_id;
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001495 if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
Ilya Dryomova162b302018-01-30 17:52:10 +01001496 rbd_dev->header.object_prefix, obj_req->object_no))
Ilya Dryomovbc812072017-01-25 18:16:23 +01001497 goto err_req;
1498
1499 if (ceph_osdc_alloc_messages(req, GFP_NOIO))
1500 goto err_req;
1501
1502 return req;
1503
1504err_req:
1505 ceph_osdc_put_request(req);
1506 return NULL;
1507}
1508
Alex Elderbf0d5f502012-11-22 00:00:08 -06001509static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
1510{
1511 ceph_osdc_put_request(osd_req);
1512}
1513
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001514static struct rbd_obj_request *rbd_obj_request_create(void)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001515{
1516 struct rbd_obj_request *obj_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001517
Ilya Dryomov5a60e872015-06-24 17:24:33 +03001518 obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
Ilya Dryomov6c696d82017-01-25 18:16:23 +01001519 if (!obj_request)
Alex Elderf907ad52013-05-01 12:43:03 -05001520 return NULL;
Alex Elderf907ad52013-05-01 12:43:03 -05001521
Alex Elderbf0d5f502012-11-22 00:00:08 -06001522 INIT_LIST_HEAD(&obj_request->links);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001523 kref_init(&obj_request->kref);
1524
Ilya Dryomov67e2b652017-01-25 18:16:22 +01001525 dout("%s %p\n", __func__, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001526 return obj_request;
1527}
1528
1529static void rbd_obj_request_destroy(struct kref *kref)
1530{
1531 struct rbd_obj_request *obj_request;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001532 u32 i;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001533
1534 obj_request = container_of(kref, struct rbd_obj_request, kref);
1535
Alex Elder37206ee2013-02-20 17:32:08 -06001536 dout("%s: obj %p\n", __func__, obj_request);
1537
Alex Elderbf0d5f502012-11-22 00:00:08 -06001538 if (obj_request->osd_req)
1539 rbd_osd_req_destroy(obj_request->osd_req);
1540
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001541 switch (obj_request->img_request->data_type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001542 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001543 case OBJ_REQUEST_BIO:
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001544 case OBJ_REQUEST_BVECS:
Ilya Dryomov5359a172018-01-20 10:30:10 +01001545 break; /* Nothing to do */
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001546 default:
1547 rbd_assert(0);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001548 }
1549
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001550 if (obj_request->copyup_bvecs) {
1551 for (i = 0; i < obj_request->copyup_bvec_count; i++) {
1552 if (obj_request->copyup_bvecs[i].bv_page)
1553 __free_page(obj_request->copyup_bvecs[i].bv_page);
1554 }
1555 kfree(obj_request->copyup_bvecs);
1556 }
Ilya Dryomovf9dcbc42018-01-20 10:30:11 +01001557
Alex Elder868311b2013-05-01 12:43:03 -05001558 kmem_cache_free(rbd_obj_request_cache, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001559}
1560
Alex Elderfb65d2282013-05-08 22:50:04 -05001561/* It's OK to call this for a device with no parent */
1562
1563static void rbd_spec_put(struct rbd_spec *spec);
1564static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1565{
1566 rbd_dev_remove_parent(rbd_dev);
1567 rbd_spec_put(rbd_dev->parent_spec);
1568 rbd_dev->parent_spec = NULL;
1569 rbd_dev->parent_overlap = 0;
1570}
1571
Alex Elderbf0d5f502012-11-22 00:00:08 -06001572/*
Alex Eldera2acd002013-05-08 22:50:04 -05001573 * Parent image reference counting is used to determine when an
 1574 * image's parent fields can be safely torn down -- that is, once
 1575 * there are no more in-flight requests to the parent image.
 1576 * Dropping the last reference triggers that cleanup.
1577 */
1578static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1579{
1580 int counter;
1581
1582 if (!rbd_dev->parent_spec)
1583 return;
1584
1585 counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1586 if (counter > 0)
1587 return;
1588
1589 /* Last reference; clean up parent data structures */
1590
1591 if (!counter)
1592 rbd_dev_unparent(rbd_dev);
1593 else
Ilya Dryomov9584d502014-07-11 12:11:20 +04001594 rbd_warn(rbd_dev, "parent reference underflow");
Alex Eldera2acd002013-05-08 22:50:04 -05001595}
1596
1597/*
1598 * If an image has a non-zero parent overlap, get a reference to its
1599 * parent.
1600 *
1601 * Returns true if the rbd device has a parent with a non-zero
1602 * overlap and a reference for it was successfully taken, or
1603 * false otherwise.
1604 */
1605static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1606{
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001607 int counter = 0;
Alex Eldera2acd002013-05-08 22:50:04 -05001608
1609 if (!rbd_dev->parent_spec)
1610 return false;
1611
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001612 down_read(&rbd_dev->header_rwsem);
1613 if (rbd_dev->parent_overlap)
1614 counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1615 up_read(&rbd_dev->header_rwsem);
Alex Eldera2acd002013-05-08 22:50:04 -05001616
1617 if (counter < 0)
Ilya Dryomov9584d502014-07-11 12:11:20 +04001618 rbd_warn(rbd_dev, "parent reference overflow");
Alex Eldera2acd002013-05-08 22:50:04 -05001619
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001620 return counter > 0;
Alex Eldera2acd002013-05-08 22:50:04 -05001621}
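/*
 * Pairing sketch (a summary, not new mechanism):
 * rbd_img_request_create() takes a parent reference and marks the
 * request layered only when rbd_dev_parent_get() succeeds;
 * rbd_img_request_destroy() drops the reference again via
 * rbd_dev_parent_put().  When the last reference goes away,
 * rbd_dev_unparent() tears down the parent spec and overlap.
 */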
1622
Alex Elderbf0d5f502012-11-22 00:00:08 -06001623/*
1624 * Caller is responsible for filling in the list of object requests
1625 * that comprises the image request, and the Linux request pointer
1626 * (if there is one).
1627 */
Alex Eldercc344fa2013-02-19 12:25:56 -06001628static struct rbd_img_request *rbd_img_request_create(
1629 struct rbd_device *rbd_dev,
Alex Elderbf0d5f502012-11-22 00:00:08 -06001630 u64 offset, u64 length,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08001631 enum obj_operation_type op_type,
Josh Durgin4e752f02014-04-08 11:12:11 -07001632 struct ceph_snap_context *snapc)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001633{
1634 struct rbd_img_request *img_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001635
Ilya Dryomova0c58952018-01-22 16:03:06 +01001636 img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001637 if (!img_request)
1638 return NULL;
1639
Alex Elderbf0d5f502012-11-22 00:00:08 -06001640 img_request->rbd_dev = rbd_dev;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001641 img_request->op_type = op_type;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001642 img_request->offset = offset;
1643 img_request->length = length;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001644 if (!rbd_img_is_write(img_request))
Alex Elderbf0d5f502012-11-22 00:00:08 -06001645 img_request->snap_id = rbd_dev->spec->snap_id;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001646 else
1647 img_request->snapc = snapc;
1648
Alex Eldera2acd002013-05-08 22:50:04 -05001649 if (rbd_dev_parent_get(rbd_dev))
Alex Elderd0b2e942013-01-24 16:13:36 -06001650 img_request_layered_set(img_request);
Ilya Dryomova0c58952018-01-22 16:03:06 +01001651
Alex Elderbf0d5f502012-11-22 00:00:08 -06001652 spin_lock_init(&img_request->completion_lock);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001653 INIT_LIST_HEAD(&img_request->obj_requests);
1654 kref_init(&img_request->kref);
1655
Alex Elder37206ee2013-02-20 17:32:08 -06001656 dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08001657 obj_op_name(op_type), offset, length, img_request);
Alex Elder37206ee2013-02-20 17:32:08 -06001658
Alex Elderbf0d5f502012-11-22 00:00:08 -06001659 return img_request;
1660}
1661
1662static void rbd_img_request_destroy(struct kref *kref)
1663{
1664 struct rbd_img_request *img_request;
1665 struct rbd_obj_request *obj_request;
1666 struct rbd_obj_request *next_obj_request;
1667
1668 img_request = container_of(kref, struct rbd_img_request, kref);
1669
Alex Elder37206ee2013-02-20 17:32:08 -06001670 dout("%s: img %p\n", __func__, img_request);
1671
Alex Elderbf0d5f502012-11-22 00:00:08 -06001672 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1673 rbd_img_obj_request_del(img_request, obj_request);
Alex Elder25dcf952013-01-25 17:08:55 -06001674 rbd_assert(img_request->obj_request_count == 0);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001675
Alex Eldera2acd002013-05-08 22:50:04 -05001676 if (img_request_layered_test(img_request)) {
1677 img_request_layered_clear(img_request);
1678 rbd_dev_parent_put(img_request->rbd_dev);
1679 }
1680
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001681 if (rbd_img_is_write(img_request))
Alex Elder812164f82013-04-30 00:44:32 -05001682 ceph_put_snap_context(img_request->snapc);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001683
Alex Elder1c2a9df2013-05-01 12:43:03 -05001684 kmem_cache_free(rbd_img_request_cache, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001685}
1686
Alex Eldere93f3152013-05-08 22:50:04 -05001687static struct rbd_img_request *rbd_parent_request_create(
1688 struct rbd_obj_request *obj_request,
1689 u64 img_offset, u64 length)
1690{
1691 struct rbd_img_request *parent_request;
1692 struct rbd_device *rbd_dev;
1693
1694 rbd_assert(obj_request->img_request);
1695 rbd_dev = obj_request->img_request->rbd_dev;
1696
Josh Durgin4e752f02014-04-08 11:12:11 -07001697 parent_request = rbd_img_request_create(rbd_dev->parent, img_offset,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08001698 length, OBJ_OP_READ, NULL);
Alex Eldere93f3152013-05-08 22:50:04 -05001699 if (!parent_request)
1700 return NULL;
1701
1702 img_request_child_set(parent_request);
1703 rbd_obj_request_get(obj_request);
1704 parent_request->obj_request = obj_request;
1705
1706 return parent_request;
1707}
1708
1709static void rbd_parent_request_destroy(struct kref *kref)
1710{
1711 struct rbd_img_request *parent_request;
1712 struct rbd_obj_request *orig_request;
1713
1714 parent_request = container_of(kref, struct rbd_img_request, kref);
1715 orig_request = parent_request->obj_request;
1716
1717 parent_request->obj_request = NULL;
1718 rbd_obj_request_put(orig_request);
1719 img_request_child_clear(parent_request);
1720
1721 rbd_img_request_destroy(kref);
1722}
1723
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001724static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
1725{
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001726 switch (obj_req->img_request->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001727 case OBJ_REQUEST_BIO:
1728 osd_req_op_extent_osd_data_bio(obj_req->osd_req, which,
1729 &obj_req->bio_pos,
1730 obj_req->length);
1731 break;
1732 case OBJ_REQUEST_BVECS:
1733 rbd_assert(obj_req->bvec_pos.iter.bi_size ==
1734 obj_req->length);
1735 osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which,
1736 &obj_req->bvec_pos);
1737 break;
1738 default:
1739 rbd_assert(0);
1740 }
1741}
1742
1743static int rbd_obj_setup_read(struct rbd_obj_request *obj_req)
1744{
Ilya Dryomova162b302018-01-30 17:52:10 +01001745 obj_req->osd_req = rbd_osd_req_create(obj_req, 1);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001746 if (!obj_req->osd_req)
1747 return -ENOMEM;
1748
1749 osd_req_op_extent_init(obj_req->osd_req, 0, CEPH_OSD_OP_READ,
1750 obj_req->offset, obj_req->length, 0, 0);
1751 rbd_osd_req_setup_data(obj_req, 0);
1752
1753 rbd_osd_req_format_read(obj_req);
1754 return 0;
1755}
1756
1757static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req,
1758 unsigned int which)
1759{
1760 struct page **pages;
1761
1762 /*
1763 * The response data for a STAT call consists of:
1764 * le64 length;
1765 * struct {
1766 * le32 tv_sec;
1767 * le32 tv_nsec;
1768 * } mtime;
1769 */
1770 pages = ceph_alloc_page_vector(1, GFP_NOIO);
1771 if (IS_ERR(pages))
1772 return PTR_ERR(pages);
1773
1774 osd_req_op_init(obj_req->osd_req, which, CEPH_OSD_OP_STAT, 0);
1775 osd_req_op_raw_data_in_pages(obj_req->osd_req, which, pages,
1776 8 + sizeof(struct ceph_timespec),
1777 0, false, true);
1778 return 0;
1779}
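/*
 * Decode sketch (illustrative only -- this driver never parses the
 * payload, it only cares whether the STAT op itself returned
 * -ENOENT):
 *
 *	void *p = page_address(pages[0]);
 *	u64 len = le64_to_cpu(*(__le64 *)p);
 *	struct ceph_timespec *mtime = p + 8;
 *
 * hence 8 + sizeof(struct ceph_timespec) bytes cover the whole
 * reply.  The final "true" above hands page ownership to the OSD
 * request, so the page vector is freed along with it.
 */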
1780
1781static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req,
1782 unsigned int which)
1783{
1784 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1785 u16 opcode;
1786
1787 osd_req_op_alloc_hint_init(obj_req->osd_req, which++,
1788 rbd_dev->layout.object_size,
1789 rbd_dev->layout.object_size);
1790
1791 if (rbd_obj_is_entire(obj_req))
1792 opcode = CEPH_OSD_OP_WRITEFULL;
1793 else
1794 opcode = CEPH_OSD_OP_WRITE;
1795
1796 osd_req_op_extent_init(obj_req->osd_req, which, opcode,
1797 obj_req->offset, obj_req->length, 0, 0);
1798 rbd_osd_req_setup_data(obj_req, which++);
1799
1800 rbd_assert(which == obj_req->osd_req->r_num_ops);
1801 rbd_osd_req_format_write(obj_req);
1802}
1803
1804static int rbd_obj_setup_write(struct rbd_obj_request *obj_req)
1805{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001806 unsigned int num_osd_ops, which = 0;
1807 int ret;
1808
1809 if (obj_request_overlaps_parent(obj_req)) {
1810 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
1811 num_osd_ops = 3; /* stat + setallochint + write/writefull */
1812 } else {
1813 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
1814 num_osd_ops = 2; /* setallochint + write/writefull */
1815 }
1816
Ilya Dryomova162b302018-01-30 17:52:10 +01001817 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001818 if (!obj_req->osd_req)
1819 return -ENOMEM;
1820
1821 if (obj_request_overlaps_parent(obj_req)) {
1822 ret = __rbd_obj_setup_stat(obj_req, which++);
1823 if (ret)
1824 return ret;
1825 }
1826
1827 __rbd_obj_setup_write(obj_req, which);
1828 return 0;
1829}
1830
1831static void __rbd_obj_setup_discard(struct rbd_obj_request *obj_req,
1832 unsigned int which)
1833{
1834 u16 opcode;
1835
1836 if (rbd_obj_is_entire(obj_req)) {
1837 if (obj_request_overlaps_parent(obj_req)) {
1838 opcode = CEPH_OSD_OP_TRUNCATE;
1839 } else {
1840 osd_req_op_init(obj_req->osd_req, which++,
1841 CEPH_OSD_OP_DELETE, 0);
1842 opcode = 0;
1843 }
1844 } else if (rbd_obj_is_tail(obj_req)) {
1845 opcode = CEPH_OSD_OP_TRUNCATE;
1846 } else {
1847 opcode = CEPH_OSD_OP_ZERO;
1848 }
1849
1850 if (opcode)
1851 osd_req_op_extent_init(obj_req->osd_req, which++, opcode,
1852 obj_req->offset, obj_req->length,
1853 0, 0);
1854
1855 rbd_assert(which == obj_req->osd_req->r_num_ops);
1856 rbd_osd_req_format_write(obj_req);
1857}
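/*
 * Opcode selection above, summarized (illustrative):
 *
 *	whole object, no parent overlap:  CEPH_OSD_OP_DELETE
 *	whole object, overlaps parent:    CEPH_OSD_OP_TRUNCATE (to 0)
 *	tail of object:                   CEPH_OSD_OP_TRUNCATE
 *	anywhere else:                    CEPH_OSD_OP_ZERO
 *
 * An object that overlaps the parent must keep existing, because
 * reads of a missing object fall back to the parent (see
 * rbd_obj_handle_read()); deleting it would make stale parent data
 * visible again.
 */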
1858
1859static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req)
1860{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001861 unsigned int num_osd_ops, which = 0;
1862 int ret;
1863
1864 if (rbd_obj_is_entire(obj_req)) {
1865 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
1866 num_osd_ops = 1; /* truncate/delete */
1867 } else {
1868 if (obj_request_overlaps_parent(obj_req)) {
1869 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
1870 num_osd_ops = 2; /* stat + truncate/zero */
1871 } else {
1872 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
1873 num_osd_ops = 1; /* truncate/zero */
1874 }
1875 }
1876
Ilya Dryomova162b302018-01-30 17:52:10 +01001877 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001878 if (!obj_req->osd_req)
1879 return -ENOMEM;
1880
1881 if (!rbd_obj_is_entire(obj_req) &&
1882 obj_request_overlaps_parent(obj_req)) {
1883 ret = __rbd_obj_setup_stat(obj_req, which++);
1884 if (ret)
1885 return ret;
1886 }
1887
1888 __rbd_obj_setup_discard(obj_req, which);
1889 return 0;
1890}
1891
1892/*
1893 * For each object request in @img_req, allocate an OSD request, add
1894 * individual OSD ops and prepare them for submission. The number of
1895 * OSD ops depends on op_type and the overlap point (if any).
1896 */
1897static int __rbd_img_fill_request(struct rbd_img_request *img_req)
1898{
1899 struct rbd_obj_request *obj_req;
1900 int ret;
1901
1902 for_each_obj_request(img_req, obj_req) {
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001903 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001904 case OBJ_OP_READ:
1905 ret = rbd_obj_setup_read(obj_req);
1906 break;
1907 case OBJ_OP_WRITE:
1908 ret = rbd_obj_setup_write(obj_req);
1909 break;
1910 case OBJ_OP_DISCARD:
1911 ret = rbd_obj_setup_discard(obj_req);
1912 break;
1913 default:
1914 rbd_assert(0);
1915 }
1916 if (ret)
1917 return ret;
1918 }
1919
1920 return 0;
1921}
1922
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001923/*
Alex Elderf1a47392013-04-19 15:34:50 -05001924 * Split up an image request into one or more object requests, each
1925 * to a different object. The "type" parameter indicates whether
1926 * "data_desc" is the pointer to the head of a list of bio
1927 * structures, or the base of a page array. In either case this
1928 * function assumes data_desc describes memory sufficient to hold
1929 * all data described by the image request.
1930 */
1931static int rbd_img_request_fill(struct rbd_img_request *img_request,
1932 enum obj_request_type type,
1933 void *data_desc)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001934{
1935 struct rbd_device *rbd_dev = img_request->rbd_dev;
1936 struct rbd_obj_request *obj_request = NULL;
1937 struct rbd_obj_request *next_obj_request;
Ilya Dryomov5359a172018-01-20 10:30:10 +01001938 struct ceph_bio_iter bio_it;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001939 struct ceph_bvec_iter bvec_it;
Alex Elder7da22d22013-01-24 16:13:36 -06001940 u64 img_offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001941 u64 resid;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001942
Alex Elderf1a47392013-04-19 15:34:50 -05001943 dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
1944 (int)type, data_desc);
Alex Elder37206ee2013-02-20 17:32:08 -06001945
Alex Elder7da22d22013-01-24 16:13:36 -06001946 img_offset = img_request->offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001947 resid = img_request->length;
Alex Elder4dda41d2013-02-20 21:59:33 -06001948 rbd_assert(resid > 0);
Alex Elderf1a47392013-04-19 15:34:50 -05001949
1950 if (type == OBJ_REQUEST_BIO) {
Ilya Dryomov5359a172018-01-20 10:30:10 +01001951 bio_it = *(struct ceph_bio_iter *)data_desc;
Kent Overstreet4f024f32013-10-11 15:44:27 -07001952 rbd_assert(img_offset ==
Ilya Dryomov5359a172018-01-20 10:30:10 +01001953 bio_it.iter.bi_sector << SECTOR_SHIFT);
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001954 } else if (type == OBJ_REQUEST_BVECS) {
1955 bvec_it = *(struct ceph_bvec_iter *)data_desc;
Alex Elderf1a47392013-04-19 15:34:50 -05001956 }
1957
Alex Elderbf0d5f502012-11-22 00:00:08 -06001958 while (resid) {
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001959 u64 object_no = img_offset >> rbd_dev->header.obj_order;
Ilya Dryomov67e2b652017-01-25 18:16:22 +01001960 u64 offset = rbd_segment_offset(rbd_dev, img_offset);
1961 u64 length = rbd_segment_length(rbd_dev, img_offset, resid);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001962
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001963 obj_request = rbd_obj_request_create();
Alex Elderbf0d5f502012-11-22 00:00:08 -06001964 if (!obj_request)
1965 goto out_unwind;
Ilya Dryomov62054da2014-03-04 11:57:17 +02001966
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001967 obj_request->object_no = object_no;
Ilya Dryomov67e2b652017-01-25 18:16:22 +01001968 obj_request->offset = offset;
1969 obj_request->length = length;
1970
Josh Durgin03507db2013-08-27 14:45:46 -07001971 /*
1972 * set obj_request->img_request before creating the
1973 * osd_request so that it gets the right snapc
1974 */
1975 rbd_img_obj_request_add(img_request, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001976
Alex Elderf1a47392013-04-19 15:34:50 -05001977 if (type == OBJ_REQUEST_BIO) {
Ilya Dryomov5359a172018-01-20 10:30:10 +01001978 obj_request->bio_pos = bio_it;
1979 ceph_bio_iter_advance(&bio_it, length);
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001980 } else if (type == OBJ_REQUEST_BVECS) {
1981 obj_request->bvec_pos = bvec_it;
1982 ceph_bvec_iter_shorten(&obj_request->bvec_pos, length);
1983 ceph_bvec_iter_advance(&bvec_it, length);
Alex Elderf1a47392013-04-19 15:34:50 -05001984 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06001985
Alex Elder7da22d22013-01-24 16:13:36 -06001986 obj_request->img_offset = img_offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001987
Alex Elder7da22d22013-01-24 16:13:36 -06001988 img_offset += length;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001989 resid -= length;
1990 }
1991
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001992 img_request->data_type = type;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001993 return __rbd_img_fill_request(img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001994
Alex Elderbf0d5f502012-11-22 00:00:08 -06001995out_unwind:
1996 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
Ilya Dryomov42dd0372014-03-04 11:57:17 +02001997 rbd_img_obj_request_del(img_request, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001998
1999 return -ENOMEM;
2000}
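/*
 * Worked example (illustrative, 4 MiB objects): a 10 MiB request at
 * image offset 3 MiB is split into four object requests:
 *
 *	objno 0: offset 3 MiB, length 1 MiB
 *	objno 1: offset 0,     length 4 MiB
 *	objno 2: offset 0,     length 4 MiB
 *	objno 3: offset 0,     length 1 MiB
 *
 * The bio/bvec iterator is advanced by each request's length, so
 * every object request points at its own slice of the data.
 */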
2001
Ilya Dryomovefbd1a12018-01-30 17:52:11 +01002002static void rbd_img_request_submit(struct rbd_img_request *img_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002003{
Alex Elderbf0d5f502012-11-22 00:00:08 -06002004 struct rbd_obj_request *obj_request;
2005
Alex Elder37206ee2013-02-20 17:32:08 -06002006 dout("%s: img %p\n", __func__, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002007
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002008 rbd_img_request_get(img_request);
Ilya Dryomovefbd1a12018-01-30 17:52:11 +01002009 for_each_obj_request(img_request, obj_request)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002010 rbd_obj_request_submit(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002011
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002012 rbd_img_request_put(img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002013}
2014
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002015static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req,
2016 u64 img_offset, u32 bytes)
2017{
2018 struct rbd_img_request *img_req = obj_req->img_request;
2019 struct rbd_img_request *child_img_req;
2020 int ret;
2021
2022 child_img_req = rbd_parent_request_create(obj_req, img_offset, bytes);
2023 if (!child_img_req)
2024 return -ENOMEM;
2025
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002026 if (!rbd_img_is_write(img_req)) {
Ilya Dryomovecc633c2018-02-01 11:50:47 +01002027 switch (img_req->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002028 case OBJ_REQUEST_BIO:
2029 ret = rbd_img_request_fill(child_img_req,
2030 OBJ_REQUEST_BIO,
2031 &obj_req->bio_pos);
2032 break;
2033 case OBJ_REQUEST_BVECS:
2034 ret = rbd_img_request_fill(child_img_req,
2035 OBJ_REQUEST_BVECS,
2036 &obj_req->bvec_pos);
2037 break;
2038 default:
2039 rbd_assert(0);
2040 }
2041 } else {
2042 struct ceph_bvec_iter it = {
2043 .bvecs = obj_req->copyup_bvecs,
2044 .iter = { .bi_size = bytes },
2045 };
2046
2047 ret = rbd_img_request_fill(child_img_req, OBJ_REQUEST_BVECS,
2048 &it);
2049 }
2050 if (ret) {
2051 rbd_img_request_put(child_img_req);
2052 return ret;
2053 }
2054
2055 rbd_img_request_submit(child_img_req);
2056 return 0;
2057}
2058
2059static bool rbd_obj_handle_read(struct rbd_obj_request *obj_req)
2060{
2061 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2062 int ret;
2063
2064 if (obj_req->result == -ENOENT &&
2065 obj_req->img_offset < rbd_dev->parent_overlap &&
2066 !obj_req->tried_parent) {
2067 u64 obj_overlap = min(obj_req->length,
2068 rbd_dev->parent_overlap - obj_req->img_offset);
2069
2070 obj_req->tried_parent = true;
2071 ret = rbd_obj_read_from_parent(obj_req, obj_req->img_offset,
2072 obj_overlap);
2073 if (ret) {
2074 obj_req->result = ret;
2075 return true;
2076 }
2077 return false;
2078 }
2079
2080 /*
2081 * -ENOENT means a hole in the image -- zero-fill the entire
2082 * length of the request. A short read also implies zero-fill
 2083 * to the end of the request. In both cases we update the xferred
 2084 * count to indicate that the whole request was satisfied.
2085 */
2086 if (obj_req->result == -ENOENT ||
2087 (!obj_req->result && obj_req->xferred < obj_req->length)) {
2088 rbd_assert(!obj_req->xferred || !obj_req->result);
2089 rbd_obj_zero_range(obj_req, obj_req->xferred,
2090 obj_req->length - obj_req->xferred);
2091 obj_req->result = 0;
2092 obj_req->xferred = obj_req->length;
2093 }
2094
2095 return true;
2096}
2097
2098/*
2099 * copyup_bvecs pages are never highmem pages
2100 */
2101static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
2102{
2103 struct ceph_bvec_iter it = {
2104 .bvecs = bvecs,
2105 .iter = { .bi_size = bytes },
2106 };
2107
2108 ceph_bvec_iter_advance_step(&it, bytes, ({
2109 if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
2110 bv.bv_len))
2111 return false;
2112 }));
2113 return true;
2114}
2115
2116static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes)
2117{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002118 unsigned int num_osd_ops = obj_req->osd_req->r_num_ops;
2119
2120 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
2121 rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT);
2122 rbd_osd_req_destroy(obj_req->osd_req);
2123
2124 /*
2125 * Create a copyup request with the same number of OSD ops as
2126 * the original request. The original request was stat + op(s),
2127 * the new copyup request will be copyup + the same op(s).
2128 */
Ilya Dryomova162b302018-01-30 17:52:10 +01002129 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002130 if (!obj_req->osd_req)
2131 return -ENOMEM;
2132
2133 /*
2134 * Only send non-zero copyup data to save some I/O and network
2135 * bandwidth -- zero copyup data is equivalent to the object not
2136 * existing.
2137 */
2138 if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) {
2139 dout("%s obj_req %p detected zeroes\n", __func__, obj_req);
2140 bytes = 0;
2141 }
2142
2143 osd_req_op_cls_init(obj_req->osd_req, 0, CEPH_OSD_OP_CALL, "rbd",
2144 "copyup");
2145 osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0,
2146 obj_req->copyup_bvecs, bytes);
2147
Ilya Dryomov9bb02482018-01-30 17:52:10 +01002148 switch (obj_req->img_request->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002149 case OBJ_OP_WRITE:
2150 __rbd_obj_setup_write(obj_req, 1);
2151 break;
2152 case OBJ_OP_DISCARD:
2153 rbd_assert(!rbd_obj_is_entire(obj_req));
2154 __rbd_obj_setup_discard(obj_req, 1);
2155 break;
2156 default:
2157 rbd_assert(0);
2158 }
2159
2160 rbd_obj_request_submit(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002161 return 0;
2162}
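/*
 * Resulting OSD request (illustrative): op 0 is the "rbd" class
 * "copyup" call carrying the parent data read earlier (or no data at
 * all if it was entirely zeroes), and ops 1..n-1 are the original
 * setallochint + write/writefull or truncate/zero ops, rebuilt by
 * __rbd_obj_setup_write()/__rbd_obj_setup_discard() above.  A
 * zero-length copyup is also used to re-submit the original ops once
 * the parent overlap has gone away -- see
 * rbd_obj_handle_write_guard() below.
 */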
2163
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01002164static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
2165{
2166 u32 i;
2167
2168 rbd_assert(!obj_req->copyup_bvecs);
2169 obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
2170 obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
2171 sizeof(*obj_req->copyup_bvecs),
2172 GFP_NOIO);
2173 if (!obj_req->copyup_bvecs)
2174 return -ENOMEM;
2175
2176 for (i = 0; i < obj_req->copyup_bvec_count; i++) {
2177 unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
2178
2179 obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
2180 if (!obj_req->copyup_bvecs[i].bv_page)
2181 return -ENOMEM;
2182
2183 obj_req->copyup_bvecs[i].bv_offset = 0;
2184 obj_req->copyup_bvecs[i].bv_len = len;
2185 obj_overlap -= len;
2186 }
2187
2188 rbd_assert(!obj_overlap);
2189 return 0;
2190}
2191
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002192static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req)
2193{
2194 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2195 u64 img_offset;
2196 u64 obj_overlap;
2197 int ret;
2198
2199 if (!obj_request_overlaps_parent(obj_req)) {
2200 /*
2201 * The overlap has become 0 (most likely because the
2202 * image has been flattened). Use rbd_obj_issue_copyup()
2203 * to re-submit the original write request -- the copyup
2204 * operation itself will be a no-op, since someone must
2205 * have populated the child object while we weren't
2206 * looking. Move to WRITE_FLAT state as we'll be done
2207 * with the operation once the null copyup completes.
2208 */
2209 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
2210 return rbd_obj_issue_copyup(obj_req, 0);
2211 }
2212
2213 /*
2214 * Determine the byte range covered by the object in the
2215 * child image to which the original request was to be sent.
2216 */
2217 img_offset = obj_req->img_offset - obj_req->offset;
2218 obj_overlap = rbd_dev->layout.object_size;
2219
2220 /*
2221 * There is no defined parent data beyond the parent
2222 * overlap, so limit what we read at that boundary if
2223 * necessary.
2224 */
2225 if (img_offset + obj_overlap > rbd_dev->parent_overlap) {
2226 rbd_assert(img_offset < rbd_dev->parent_overlap);
2227 obj_overlap = rbd_dev->parent_overlap - img_offset;
2228 }
2229
2230 ret = setup_copyup_bvecs(obj_req, obj_overlap);
2231 if (ret)
2232 return ret;
2233
2234 obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
2235 return rbd_obj_read_from_parent(obj_req, img_offset, obj_overlap);
2236}
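/*
 * Worked example (illustrative, 4 MiB objects): with a parent
 * overlap of 6 MiB, a guarded write to object 1 (image range
 * 4..8 MiB) computes img_offset = 4 MiB.  Since 4 MiB + 4 MiB
 * exceeds the overlap, obj_overlap is clamped to 2 MiB: only the
 * first 2 MiB of that object are backed by parent data, so only
 * that much is read for the copyup.
 */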
2237
2238static bool rbd_obj_handle_write(struct rbd_obj_request *obj_req)
2239{
2240 int ret;
2241
2242again:
2243 switch (obj_req->write_state) {
2244 case RBD_OBJ_WRITE_GUARD:
2245 rbd_assert(!obj_req->xferred);
2246 if (obj_req->result == -ENOENT) {
2247 /*
2248 * The target object doesn't exist. Read the data for
2249 * the entire target object up to the overlap point (if
2250 * any) from the parent, so we can use it for a copyup.
2251 */
2252 ret = rbd_obj_handle_write_guard(obj_req);
2253 if (ret) {
2254 obj_req->result = ret;
2255 return true;
2256 }
2257 return false;
2258 }
2259 /* fall through */
2260 case RBD_OBJ_WRITE_FLAT:
2261 if (!obj_req->result)
2262 /*
2263 * There is no such thing as a successful short
2264 * write -- indicate the whole request was satisfied.
2265 */
2266 obj_req->xferred = obj_req->length;
2267 return true;
2268 case RBD_OBJ_WRITE_COPYUP:
2269 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
2270 if (obj_req->result)
2271 goto again;
2272
2273 rbd_assert(obj_req->xferred);
2274 ret = rbd_obj_issue_copyup(obj_req, obj_req->xferred);
2275 if (ret) {
2276 obj_req->result = ret;
2277 return true;
2278 }
2279 return false;
2280 default:
2281 rbd_assert(0);
2282 }
2283}
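/*
 * The write state machine above, summarized (illustrative):
 *
 *	RBD_OBJ_WRITE_FLAT:   done -- report the result as is
 *	RBD_OBJ_WRITE_GUARD:  -ENOENT means the guarded object does
 *	                      not exist; read the parent range and
 *	                      move to COPYUP.  Any other result
 *	                      completes the request as in FLAT.
 *	RBD_OBJ_WRITE_COPYUP: the parent read finished; resubmit as
 *	                      copyup + original op(s) and go back to
 *	                      GUARD.
 */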
2284
2285/*
2286 * Returns true if @obj_req is completed, or false otherwise.
2287 */
2288static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req)
2289{
Ilya Dryomov9bb02482018-01-30 17:52:10 +01002290 switch (obj_req->img_request->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002291 case OBJ_OP_READ:
2292 return rbd_obj_handle_read(obj_req);
2293 case OBJ_OP_WRITE:
2294 return rbd_obj_handle_write(obj_req);
2295 case OBJ_OP_DISCARD:
2296 if (rbd_obj_handle_write(obj_req)) {
2297 /*
2298 * Hide -ENOENT from delete/truncate/zero -- discarding
2299 * a non-existent object is not a problem.
2300 */
2301 if (obj_req->result == -ENOENT) {
2302 obj_req->result = 0;
2303 obj_req->xferred = obj_req->length;
2304 }
2305 return true;
2306 }
2307 return false;
2308 default:
2309 rbd_assert(0);
2310 }
2311}
2312
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002313static void rbd_obj_end_request(struct rbd_obj_request *obj_req)
2314{
2315 struct rbd_img_request *img_req = obj_req->img_request;
2316
2317 rbd_assert((!obj_req->result &&
2318 obj_req->xferred == obj_req->length) ||
2319 (obj_req->result < 0 && !obj_req->xferred));
2320 if (!obj_req->result) {
2321 img_req->xferred += obj_req->xferred;
2322 return;
2323 }
2324
2325 rbd_warn(img_req->rbd_dev,
2326 "%s at objno %llu %llu~%llu result %d xferred %llu",
2327 obj_op_name(img_req->op_type), obj_req->object_no,
2328 obj_req->offset, obj_req->length, obj_req->result,
2329 obj_req->xferred);
2330 if (!img_req->result) {
2331 img_req->result = obj_req->result;
2332 img_req->xferred = 0;
2333 }
2334}
2335
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002336static void rbd_img_end_child_request(struct rbd_img_request *img_req)
2337{
2338 struct rbd_obj_request *obj_req = img_req->obj_request;
2339
2340 rbd_assert(test_bit(IMG_REQ_CHILD, &img_req->flags));
2341
2342 obj_req->result = img_req->result;
2343 obj_req->xferred = img_req->xferred;
2344 rbd_img_request_put(img_req);
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002345}
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002346
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002347static void rbd_img_end_request(struct rbd_img_request *img_req)
2348{
2349 rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
2350 rbd_assert((!img_req->result &&
2351 img_req->xferred == blk_rq_bytes(img_req->rq)) ||
2352 (img_req->result < 0 && !img_req->xferred));
2353
2354 blk_mq_end_request(img_req->rq,
2355 errno_to_blk_status(img_req->result));
2356 rbd_img_request_put(img_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002357}
2358
2359static void rbd_obj_handle_request(struct rbd_obj_request *obj_req)
2360{
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002361 struct rbd_img_request *img_req;
2362
2363again:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002364 if (!__rbd_obj_handle_request(obj_req))
2365 return;
2366
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002367 img_req = obj_req->img_request;
2368 spin_lock(&img_req->completion_lock);
2369 rbd_obj_end_request(obj_req);
2370 rbd_assert(img_req->pending_count);
2371 if (--img_req->pending_count) {
2372 spin_unlock(&img_req->completion_lock);
2373 return;
2374 }
2375
2376 spin_unlock(&img_req->completion_lock);
2377 if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
2378 obj_req = img_req->obj_request;
2379 rbd_img_end_child_request(img_req);
2380 goto again;
2381 }
2382 rbd_img_end_request(img_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002383}
2384
Ilya Dryomoved95b212016-08-12 16:40:02 +02002385static const struct rbd_client_id rbd_empty_cid;
2386
2387static bool rbd_cid_equal(const struct rbd_client_id *lhs,
2388 const struct rbd_client_id *rhs)
2389{
2390 return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
2391}
2392
2393static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
2394{
2395 struct rbd_client_id cid;
2396
2397 mutex_lock(&rbd_dev->watch_mutex);
2398 cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
2399 cid.handle = rbd_dev->watch_cookie;
2400 mutex_unlock(&rbd_dev->watch_mutex);
2401 return cid;
2402}
2403
2404/*
2405 * lock_rwsem must be held for write
2406 */
2407static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
2408 const struct rbd_client_id *cid)
2409{
2410 dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
2411 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
2412 cid->gid, cid->handle);
2413 rbd_dev->owner_cid = *cid; /* struct */
2414}
2415
2416static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
2417{
2418 mutex_lock(&rbd_dev->watch_mutex);
2419 sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
2420 mutex_unlock(&rbd_dev->watch_mutex);
2421}
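/*
 * The cookie ties the exclusive lock to this client's watch: for
 * watch cookie 42 the string is RBD_LOCK_COOKIE_PREFIX " 42".
 * find_watcher() below parses it back with the same format to
 * decide whether the lock holder is still alive.
 */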
2422
Florian Margaineedd8ca82017-12-13 16:43:59 +01002423static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
2424{
2425 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
2426
2427 strcpy(rbd_dev->lock_cookie, cookie);
2428 rbd_set_owner_cid(rbd_dev, &cid);
2429 queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
2430}
2431
Ilya Dryomoved95b212016-08-12 16:40:02 +02002432/*
2433 * lock_rwsem must be held for write
2434 */
2435static int rbd_lock(struct rbd_device *rbd_dev)
2436{
2437 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomoved95b212016-08-12 16:40:02 +02002438 char cookie[32];
2439 int ret;
2440
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02002441 WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
2442 rbd_dev->lock_cookie[0] != '\0');
Ilya Dryomoved95b212016-08-12 16:40:02 +02002443
2444 format_lock_cookie(rbd_dev, cookie);
2445 ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
2446 RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
2447 RBD_LOCK_TAG, "", 0);
2448 if (ret)
2449 return ret;
2450
2451 rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
Florian Margaineedd8ca82017-12-13 16:43:59 +01002452 __rbd_lock(rbd_dev, cookie);
Ilya Dryomoved95b212016-08-12 16:40:02 +02002453 return 0;
2454}
2455
2456/*
2457 * lock_rwsem must be held for write
2458 */
Ilya Dryomovbbead742017-04-13 12:17:38 +02002459static void rbd_unlock(struct rbd_device *rbd_dev)
Ilya Dryomoved95b212016-08-12 16:40:02 +02002460{
2461 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomoved95b212016-08-12 16:40:02 +02002462 int ret;
2463
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02002464 WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
2465 rbd_dev->lock_cookie[0] == '\0');
Ilya Dryomoved95b212016-08-12 16:40:02 +02002466
Ilya Dryomoved95b212016-08-12 16:40:02 +02002467 ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02002468 RBD_LOCK_NAME, rbd_dev->lock_cookie);
Ilya Dryomovbbead742017-04-13 12:17:38 +02002469 if (ret && ret != -ENOENT)
2470 rbd_warn(rbd_dev, "failed to unlock: %d", ret);
Ilya Dryomoved95b212016-08-12 16:40:02 +02002471
Ilya Dryomovbbead742017-04-13 12:17:38 +02002472 /* treat errors as the image is unlocked */
2473 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02002474 rbd_dev->lock_cookie[0] = '\0';
Ilya Dryomoved95b212016-08-12 16:40:02 +02002475 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
2476 queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
Ilya Dryomoved95b212016-08-12 16:40:02 +02002477}
2478
2479static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
2480 enum rbd_notify_op notify_op,
2481 struct page ***preply_pages,
2482 size_t *preply_len)
2483{
2484 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2485 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
2486 int buf_size = 4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN;
2487 char buf[buf_size];
2488 void *p = buf;
2489
2490 dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
2491
2492 /* encode *LockPayload NotifyMessage (op + ClientId) */
2493 ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
2494 ceph_encode_32(&p, notify_op);
2495 ceph_encode_64(&p, cid.gid);
2496 ceph_encode_64(&p, cid.handle);
2497
2498 return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
2499 &rbd_dev->header_oloc, buf, buf_size,
2500 RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
2501}
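/*
 * Wire layout of the NotifyMessage built above (illustrative):
 *
 *	u8   struct_v (2), u8 struct_compat (1), le32 struct_len (20)
 *	le32 notify_op
 *	le64 gid
 *	le64 handle
 *
 * i.e. CEPH_ENCODING_START_BLK_LEN (6) + 4 + 8 + 8 = 26 bytes,
 * matching buf_size.
 */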
2502
2503static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
2504 enum rbd_notify_op notify_op)
2505{
2506 struct page **reply_pages;
2507 size_t reply_len;
2508
2509 __rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
2510 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
2511}
2512
2513static void rbd_notify_acquired_lock(struct work_struct *work)
2514{
2515 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
2516 acquired_lock_work);
2517
2518 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
2519}
2520
2521static void rbd_notify_released_lock(struct work_struct *work)
2522{
2523 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
2524 released_lock_work);
2525
2526 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
2527}
2528
2529static int rbd_request_lock(struct rbd_device *rbd_dev)
2530{
2531 struct page **reply_pages;
2532 size_t reply_len;
2533 bool lock_owner_responded = false;
2534 int ret;
2535
2536 dout("%s rbd_dev %p\n", __func__, rbd_dev);
2537
2538 ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
2539 &reply_pages, &reply_len);
2540 if (ret && ret != -ETIMEDOUT) {
2541 rbd_warn(rbd_dev, "failed to request lock: %d", ret);
2542 goto out;
2543 }
2544
2545 if (reply_len > 0 && reply_len <= PAGE_SIZE) {
2546 void *p = page_address(reply_pages[0]);
2547 void *const end = p + reply_len;
2548 u32 n;
2549
2550 ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
2551 while (n--) {
2552 u8 struct_v;
2553 u32 len;
2554
2555 ceph_decode_need(&p, end, 8 + 8, e_inval);
2556 p += 8 + 8; /* skip gid and cookie */
2557
2558 ceph_decode_32_safe(&p, end, len, e_inval);
2559 if (!len)
2560 continue;
2561
2562 if (lock_owner_responded) {
2563 rbd_warn(rbd_dev,
2564 "duplicate lock owners detected");
2565 ret = -EIO;
2566 goto out;
2567 }
2568
2569 lock_owner_responded = true;
2570 ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
2571 &struct_v, &len);
2572 if (ret) {
2573 rbd_warn(rbd_dev,
2574 "failed to decode ResponseMessage: %d",
2575 ret);
2576 goto e_inval;
2577 }
2578
2579 ret = ceph_decode_32(&p);
2580 }
2581 }
2582
2583 if (!lock_owner_responded) {
2584 rbd_warn(rbd_dev, "no lock owners detected");
2585 ret = -ETIMEDOUT;
2586 }
2587
2588out:
2589 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
2590 return ret;
2591
2592e_inval:
2593 ret = -EINVAL;
2594 goto out;
2595}
2596
2597static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
2598{
2599 dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);
2600
2601 cancel_delayed_work(&rbd_dev->lock_dwork);
2602 if (wake_all)
2603 wake_up_all(&rbd_dev->lock_waitq);
2604 else
2605 wake_up(&rbd_dev->lock_waitq);
2606}
2607
2608static int get_lock_owner_info(struct rbd_device *rbd_dev,
2609 struct ceph_locker **lockers, u32 *num_lockers)
2610{
2611 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2612 u8 lock_type;
2613 char *lock_tag;
2614 int ret;
2615
2616 dout("%s rbd_dev %p\n", __func__, rbd_dev);
2617
2618 ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
2619 &rbd_dev->header_oloc, RBD_LOCK_NAME,
2620 &lock_type, &lock_tag, lockers, num_lockers);
2621 if (ret)
2622 return ret;
2623
2624 if (*num_lockers == 0) {
2625 dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
2626 goto out;
2627 }
2628
2629 if (strcmp(lock_tag, RBD_LOCK_TAG)) {
2630 rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
2631 lock_tag);
2632 ret = -EBUSY;
2633 goto out;
2634 }
2635
2636 if (lock_type == CEPH_CLS_LOCK_SHARED) {
2637 rbd_warn(rbd_dev, "shared lock type detected");
2638 ret = -EBUSY;
2639 goto out;
2640 }
2641
2642 if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
2643 strlen(RBD_LOCK_COOKIE_PREFIX))) {
2644 rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
2645 (*lockers)[0].id.cookie);
2646 ret = -EBUSY;
2647 goto out;
2648 }
2649
2650out:
2651 kfree(lock_tag);
2652 return ret;
2653}
2654
2655static int find_watcher(struct rbd_device *rbd_dev,
2656 const struct ceph_locker *locker)
2657{
2658 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2659 struct ceph_watch_item *watchers;
2660 u32 num_watchers;
2661 u64 cookie;
2662 int i;
2663 int ret;
2664
2665 ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
2666 &rbd_dev->header_oloc, &watchers,
2667 &num_watchers);
2668 if (ret)
2669 return ret;
2670
2671 sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
2672 for (i = 0; i < num_watchers; i++) {
2673 if (!memcmp(&watchers[i].addr, &locker->info.addr,
2674 sizeof(locker->info.addr)) &&
2675 watchers[i].cookie == cookie) {
2676 struct rbd_client_id cid = {
2677 .gid = le64_to_cpu(watchers[i].name.num),
2678 .handle = cookie,
2679 };
2680
2681 dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
2682 rbd_dev, cid.gid, cid.handle);
2683 rbd_set_owner_cid(rbd_dev, &cid);
2684 ret = 1;
2685 goto out;
2686 }
2687 }
2688
2689 dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
2690 ret = 0;
2691out:
2692 kfree(watchers);
2693 return ret;
2694}
2695
2696/*
2697 * lock_rwsem must be held for write
2698 */
2699static int rbd_try_lock(struct rbd_device *rbd_dev)
2700{
2701 struct ceph_client *client = rbd_dev->rbd_client->client;
2702 struct ceph_locker *lockers;
2703 u32 num_lockers;
2704 int ret;
2705
2706 for (;;) {
2707 ret = rbd_lock(rbd_dev);
2708 if (ret != -EBUSY)
2709 return ret;
2710
2711 /* determine if the current lock holder is still alive */
2712 ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
2713 if (ret)
2714 return ret;
2715
2716 if (num_lockers == 0)
2717 goto again;
2718
2719 ret = find_watcher(rbd_dev, lockers);
2720 if (ret) {
2721 if (ret > 0)
2722 ret = 0; /* have to request lock */
2723 goto out;
2724 }
2725
2726 rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
2727 ENTITY_NAME(lockers[0].id.name));
2728
2729 ret = ceph_monc_blacklist_add(&client->monc,
2730 &lockers[0].info.addr);
2731 if (ret) {
2732 rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
2733 ENTITY_NAME(lockers[0].id.name), ret);
2734 goto out;
2735 }
2736
2737 ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
2738 &rbd_dev->header_oloc, RBD_LOCK_NAME,
2739 lockers[0].id.cookie,
2740 &lockers[0].id.name);
2741 if (ret && ret != -ENOENT)
2742 goto out;
2743
2744again:
2745 ceph_free_lockers(lockers, num_lockers);
2746 }
2747
2748out:
2749 ceph_free_lockers(lockers, num_lockers);
2750 return ret;
2751}
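/*
 * Acquisition sketch (a summary of the loop above): try to take the
 * lock directly; on -EBUSY, check whether the current holder still
 * has a watch established.  A live holder means we have to ask for
 * the lock (rbd_request_lock()); a dead one is blacklisted and its
 * lock broken, after which we retry.
 */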
2752
2753/*
2754 * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED
2755 */
2756static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev,
2757 int *pret)
2758{
2759 enum rbd_lock_state lock_state;
2760
2761 down_read(&rbd_dev->lock_rwsem);
2762 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
2763 rbd_dev->lock_state);
2764 if (__rbd_is_lock_owner(rbd_dev)) {
2765 lock_state = rbd_dev->lock_state;
2766 up_read(&rbd_dev->lock_rwsem);
2767 return lock_state;
2768 }
2769
2770 up_read(&rbd_dev->lock_rwsem);
2771 down_write(&rbd_dev->lock_rwsem);
2772 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
2773 rbd_dev->lock_state);
2774 if (!__rbd_is_lock_owner(rbd_dev)) {
2775 *pret = rbd_try_lock(rbd_dev);
2776 if (*pret)
2777 rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret);
2778 }
2779
2780 lock_state = rbd_dev->lock_state;
2781 up_write(&rbd_dev->lock_rwsem);
2782 return lock_state;
2783}
2784
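/*
 * Delayed work that attempts to acquire the exclusive lock.  Requests
 * blocked on the lock are woken once it is acquired; otherwise the
 * work requeues itself until the lock is obtained or the attempt is
 * aborted (blacklisted, or the peer refuses to release the lock).
 */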
2785static void rbd_acquire_lock(struct work_struct *work)
2786{
2787 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
2788 struct rbd_device, lock_dwork);
2789 enum rbd_lock_state lock_state;
Kefeng Wang37f13252017-07-13 15:46:35 +08002790 int ret = 0;
Ilya Dryomoved95b212016-08-12 16:40:02 +02002791
2792 dout("%s rbd_dev %p\n", __func__, rbd_dev);
2793again:
2794 lock_state = rbd_try_acquire_lock(rbd_dev, &ret);
2795 if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) {
2796 if (lock_state == RBD_LOCK_STATE_LOCKED)
2797 wake_requests(rbd_dev, true);
2798 dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
2799 rbd_dev, lock_state, ret);
2800 return;
2801 }
2802
2803 ret = rbd_request_lock(rbd_dev);
2804 if (ret == -ETIMEDOUT) {
2805 goto again; /* treat this as a dead client */
Ilya Dryomove010dd02017-04-13 12:17:39 +02002806 } else if (ret == -EROFS) {
2807 rbd_warn(rbd_dev, "peer will not release lock");
2808 /*
2809 * If this is rbd_add_acquire_lock(), we want to fail
2810 * immediately -- reuse BLACKLISTED flag. Otherwise we
2811 * want to block.
2812 */
2813 if (!(rbd_dev->disk->flags & GENHD_FL_UP)) {
2814 set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
2815 /* wake "rbd map --exclusive" process */
2816 wake_requests(rbd_dev, false);
2817 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02002818 } else if (ret < 0) {
2819 rbd_warn(rbd_dev, "error requesting lock: %d", ret);
2820 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
2821 RBD_RETRY_DELAY);
2822 } else {
2823 /*
2824 * lock owner acked, but resend if we don't see them
2825 * release the lock
2826 */
2827 dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
2828 rbd_dev);
2829 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
2830 msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
2831 }
2832}
2833
2834/*
2835 * lock_rwsem must be held for write
2836 */
2837static bool rbd_release_lock(struct rbd_device *rbd_dev)
2838{
2839 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
2840 rbd_dev->lock_state);
2841 if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
2842 return false;
2843
2844 rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
2845 downgrade_write(&rbd_dev->lock_rwsem);
2846 /*
2847 * Ensure that all in-flight IO is flushed.
2848 *
2849 * FIXME: ceph_osdc_sync() flushes the entire OSD client, which
2850 * may be shared with other devices.
2851 */
2852 ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc);
2853 up_read(&rbd_dev->lock_rwsem);
2854
2855 down_write(&rbd_dev->lock_rwsem);
2856 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
2857 rbd_dev->lock_state);
2858 if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
2859 return false;
2860
Ilya Dryomovbbead742017-04-13 12:17:38 +02002861 rbd_unlock(rbd_dev);
2862 /*
2863 * Give others a chance to grab the lock - we would re-acquire
2864 * almost immediately if we got new IO during ceph_osdc_sync()
2865 * otherwise. We need to ack our own notifications, so this
2866 * lock_dwork will be requeued from rbd_wait_state_locked()
2867 * after wake_requests() in rbd_handle_released_lock().
2868 */
2869 cancel_delayed_work(&rbd_dev->lock_dwork);
Ilya Dryomoved95b212016-08-12 16:40:02 +02002870 return true;
2871}
2872
2873static void rbd_release_lock_work(struct work_struct *work)
2874{
2875 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
2876 unlock_work);
2877
2878 down_write(&rbd_dev->lock_rwsem);
2879 rbd_release_lock(rbd_dev);
2880 up_write(&rbd_dev->lock_rwsem);
2881}
2882
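/*
 * Handle an ACQUIRED_LOCK notification: record the notifying client as
 * the lock owner (if a client id was included) and wake any waiters.
 */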
2883static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
2884 void **p)
2885{
2886 struct rbd_client_id cid = { 0 };
2887
2888 if (struct_v >= 2) {
2889 cid.gid = ceph_decode_64(p);
2890 cid.handle = ceph_decode_64(p);
2891 }
2892
2893 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
2894 cid.handle);
2895 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
2896 down_write(&rbd_dev->lock_rwsem);
2897 if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
2898 /*
2899 * we already know that the remote client is
2900 * the owner
2901 */
2902 up_write(&rbd_dev->lock_rwsem);
2903 return;
2904 }
2905
2906 rbd_set_owner_cid(rbd_dev, &cid);
2907 downgrade_write(&rbd_dev->lock_rwsem);
2908 } else {
2909 down_read(&rbd_dev->lock_rwsem);
2910 }
2911
2912 if (!__rbd_is_lock_owner(rbd_dev))
2913 wake_requests(rbd_dev, false);
2914 up_read(&rbd_dev->lock_rwsem);
2915}
2916
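/*
 * Handle a RELEASED_LOCK notification: clear the recorded lock owner
 * if it matches the notifying client and wake any waiters so the lock
 * can be reacquired.
 */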
2917static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
2918 void **p)
2919{
2920 struct rbd_client_id cid = { 0 };
2921
2922 if (struct_v >= 2) {
2923 cid.gid = ceph_decode_64(p);
2924 cid.handle = ceph_decode_64(p);
2925 }
2926
2927 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
2928 cid.handle);
2929 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
2930 down_write(&rbd_dev->lock_rwsem);
2931 if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
2932 dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
2933 __func__, rbd_dev, cid.gid, cid.handle,
2934 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
2935 up_write(&rbd_dev->lock_rwsem);
2936 return;
2937 }
2938
2939 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
2940 downgrade_write(&rbd_dev->lock_rwsem);
2941 } else {
2942 down_read(&rbd_dev->lock_rwsem);
2943 }
2944
2945 if (!__rbd_is_lock_owner(rbd_dev))
2946 wake_requests(rbd_dev, false);
2947 up_read(&rbd_dev->lock_rwsem);
2948}
2949
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02002950/*
2951 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
2952 * ResponseMessage is needed.
2953 */
2954static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
2955 void **p)
Ilya Dryomoved95b212016-08-12 16:40:02 +02002956{
2957 struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
2958 struct rbd_client_id cid = { 0 };
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02002959 int result = 1;
Ilya Dryomoved95b212016-08-12 16:40:02 +02002960
2961 if (struct_v >= 2) {
2962 cid.gid = ceph_decode_64(p);
2963 cid.handle = ceph_decode_64(p);
2964 }
2965
2966 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
2967 cid.handle);
2968 if (rbd_cid_equal(&cid, &my_cid))
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02002969 return result;
Ilya Dryomoved95b212016-08-12 16:40:02 +02002970
2971 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02002972 if (__rbd_is_lock_owner(rbd_dev)) {
2973 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
2974 rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
2975 goto out_unlock;
2976
2977 /*
2978 * encode ResponseMessage(0) so the peer can detect
2979 * a missing owner
2980 */
2981 result = 0;
2982
2983 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
Ilya Dryomove010dd02017-04-13 12:17:39 +02002984 if (!rbd_dev->opts->exclusive) {
2985 dout("%s rbd_dev %p queueing unlock_work\n",
2986 __func__, rbd_dev);
2987 queue_work(rbd_dev->task_wq,
2988 &rbd_dev->unlock_work);
2989 } else {
2990 /* refuse to release the lock */
2991 result = -EROFS;
2992 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02002993 }
2994 }
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02002995
2996out_unlock:
Ilya Dryomoved95b212016-08-12 16:40:02 +02002997 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02002998 return result;
Ilya Dryomoved95b212016-08-12 16:40:02 +02002999}
3000
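/*
 * Ack a notification, optionally encoding a ResponseMessage with the
 * given result so the notifier can tell how the request was handled.
 */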
3001static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
3002 u64 notify_id, u64 cookie, s32 *result)
3003{
3004 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3005 int buf_size = 4 + CEPH_ENCODING_START_BLK_LEN;
3006 char buf[buf_size];
3007 int ret;
3008
3009 if (result) {
3010 void *p = buf;
3011
3012 /* encode ResponseMessage */
3013 ceph_start_encoding(&p, 1, 1,
3014 buf_size - CEPH_ENCODING_START_BLK_LEN);
3015 ceph_encode_32(&p, *result);
3016 } else {
3017 buf_size = 0;
3018 }
3019
3020 ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
3021 &rbd_dev->header_oloc, notify_id, cookie,
3022 buf, buf_size);
3023 if (ret)
3024 rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
3025}
3026
3027static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
3028 u64 cookie)
3029{
3030 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3031 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
3032}
3033
3034static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
3035 u64 notify_id, u64 cookie, s32 result)
3036{
3037 dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3038 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
3039}
Ilya Dryomov922dab62016-05-26 01:15:02 +02003040
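/*
 * Watch callback: decode the NotifyMessage envelope (absent in legacy
 * header-update notifications) and dispatch on the notify op.
 */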
3041static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
3042 u64 notifier_id, void *data, size_t data_len)
Alex Elderb8d70032012-11-30 17:53:04 -06003043{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003044 struct rbd_device *rbd_dev = arg;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003045 void *p = data;
3046 void *const end = p + data_len;
Ilya Dryomovd4c22692016-09-06 11:15:48 +02003047 u8 struct_v = 0;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003048 u32 len;
3049 u32 notify_op;
Alex Elderb8d70032012-11-30 17:53:04 -06003050 int ret;
3051
Ilya Dryomoved95b212016-08-12 16:40:02 +02003052 dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
3053 __func__, rbd_dev, cookie, notify_id, data_len);
3054 if (data_len) {
3055 ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
3056 &struct_v, &len);
3057 if (ret) {
3058 rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
3059 ret);
3060 return;
3061 }
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04003062
Ilya Dryomoved95b212016-08-12 16:40:02 +02003063 notify_op = ceph_decode_32(&p);
3064 } else {
3065 /* legacy notification for header updates */
3066 notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
3067 len = 0;
3068 }
Alex Elderb8d70032012-11-30 17:53:04 -06003069
Ilya Dryomoved95b212016-08-12 16:40:02 +02003070 dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
3071 switch (notify_op) {
3072 case RBD_NOTIFY_OP_ACQUIRED_LOCK:
3073 rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
3074 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3075 break;
3076 case RBD_NOTIFY_OP_RELEASED_LOCK:
3077 rbd_handle_released_lock(rbd_dev, struct_v, &p);
3078 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3079 break;
3080 case RBD_NOTIFY_OP_REQUEST_LOCK:
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003081 ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
3082 if (ret <= 0)
Ilya Dryomoved95b212016-08-12 16:40:02 +02003083 rbd_acknowledge_notify_result(rbd_dev, notify_id,
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003084 cookie, ret);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003085 else
3086 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3087 break;
3088 case RBD_NOTIFY_OP_HEADER_UPDATE:
3089 ret = rbd_dev_refresh(rbd_dev);
3090 if (ret)
3091 rbd_warn(rbd_dev, "refresh failed: %d", ret);
3092
3093 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3094 break;
3095 default:
3096 if (rbd_is_lock_owner(rbd_dev))
3097 rbd_acknowledge_notify_result(rbd_dev, notify_id,
3098 cookie, -EOPNOTSUPP);
3099 else
3100 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3101 break;
3102 }
Alex Elderb8d70032012-11-30 17:53:04 -06003103}
3104
Ilya Dryomov99d16942016-08-12 16:11:41 +02003105static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
3106
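/*
 * Watch error callback: forget the lock owner and, if the watch was
 * registered, tear it down and schedule an immediate rewatch attempt.
 */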
Ilya Dryomov922dab62016-05-26 01:15:02 +02003107static void rbd_watch_errcb(void *arg, u64 cookie, int err)
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003108{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003109 struct rbd_device *rbd_dev = arg;
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003110
Ilya Dryomov922dab62016-05-26 01:15:02 +02003111 rbd_warn(rbd_dev, "encountered watch error: %d", err);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003112
Ilya Dryomoved95b212016-08-12 16:40:02 +02003113 down_write(&rbd_dev->lock_rwsem);
3114 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3115 up_write(&rbd_dev->lock_rwsem);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003116
Ilya Dryomov99d16942016-08-12 16:11:41 +02003117 mutex_lock(&rbd_dev->watch_mutex);
3118 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
3119 __rbd_unregister_watch(rbd_dev);
3120 rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003121
Ilya Dryomov99d16942016-08-12 16:11:41 +02003122 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003123 }
Ilya Dryomov99d16942016-08-12 16:11:41 +02003124 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003125}
3126
3127/*
Ilya Dryomov99d16942016-08-12 16:11:41 +02003128 * watch_mutex must be locked
Alex Elder9969ebc2013-01-18 12:31:10 -06003129 */
Ilya Dryomov99d16942016-08-12 16:11:41 +02003130static int __rbd_register_watch(struct rbd_device *rbd_dev)
Alex Elder9969ebc2013-01-18 12:31:10 -06003131{
3132 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomov922dab62016-05-26 01:15:02 +02003133 struct ceph_osd_linger_request *handle;
Alex Elder9969ebc2013-01-18 12:31:10 -06003134
Ilya Dryomov922dab62016-05-26 01:15:02 +02003135 rbd_assert(!rbd_dev->watch_handle);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003136 dout("%s rbd_dev %p\n", __func__, rbd_dev);
Alex Elder9969ebc2013-01-18 12:31:10 -06003137
Ilya Dryomov922dab62016-05-26 01:15:02 +02003138 handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
3139 &rbd_dev->header_oloc, rbd_watch_cb,
3140 rbd_watch_errcb, rbd_dev);
3141 if (IS_ERR(handle))
3142 return PTR_ERR(handle);
Alex Elder9969ebc2013-01-18 12:31:10 -06003143
Ilya Dryomov922dab62016-05-26 01:15:02 +02003144 rbd_dev->watch_handle = handle;
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003145 return 0;
Alex Elder9969ebc2013-01-18 12:31:10 -06003146}
3147
Ilya Dryomov99d16942016-08-12 16:11:41 +02003148/*
3149 * watch_mutex must be locked
3150 */
3151static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
Ilya Dryomovfca27062013-12-16 18:02:40 +02003152{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003153 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3154 int ret;
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003155
Ilya Dryomov99d16942016-08-12 16:11:41 +02003156 rbd_assert(rbd_dev->watch_handle);
3157 dout("%s rbd_dev %p\n", __func__, rbd_dev);
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003158
Ilya Dryomov922dab62016-05-26 01:15:02 +02003159 ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
3160 if (ret)
3161 rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003162
Ilya Dryomov922dab62016-05-26 01:15:02 +02003163 rbd_dev->watch_handle = NULL;
Ilya Dryomovc525f032016-04-28 16:07:26 +02003164}
3165
Ilya Dryomov99d16942016-08-12 16:11:41 +02003166static int rbd_register_watch(struct rbd_device *rbd_dev)
Ilya Dryomovc525f032016-04-28 16:07:26 +02003167{
Ilya Dryomov99d16942016-08-12 16:11:41 +02003168 int ret;
Ilya Dryomov811c6682016-04-15 16:22:16 +02003169
Ilya Dryomov99d16942016-08-12 16:11:41 +02003170 mutex_lock(&rbd_dev->watch_mutex);
3171 rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
3172 ret = __rbd_register_watch(rbd_dev);
3173 if (ret)
3174 goto out;
3175
3176 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
3177 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
3178
3179out:
3180 mutex_unlock(&rbd_dev->watch_mutex);
3181 return ret;
3182}
3183
3184static void cancel_tasks_sync(struct rbd_device *rbd_dev)
3185{
3186 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3187
3188 cancel_delayed_work_sync(&rbd_dev->watch_dwork);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003189 cancel_work_sync(&rbd_dev->acquired_lock_work);
3190 cancel_work_sync(&rbd_dev->released_lock_work);
3191 cancel_delayed_work_sync(&rbd_dev->lock_dwork);
3192 cancel_work_sync(&rbd_dev->unlock_work);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003193}
3194
3195static void rbd_unregister_watch(struct rbd_device *rbd_dev)
3196{
Ilya Dryomoved95b212016-08-12 16:40:02 +02003197 WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
Ilya Dryomov99d16942016-08-12 16:11:41 +02003198 cancel_tasks_sync(rbd_dev);
3199
3200 mutex_lock(&rbd_dev->watch_mutex);
3201 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
3202 __rbd_unregister_watch(rbd_dev);
3203 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
3204 mutex_unlock(&rbd_dev->watch_mutex);
3205
Ilya Dryomov811c6682016-04-15 16:22:16 +02003206 ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
Ilya Dryomovfca27062013-12-16 18:02:40 +02003207}
3208
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003209/*
3210 * lock_rwsem must be held for write
3211 */
3212static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
3213{
3214 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3215 char cookie[32];
3216 int ret;
3217
3218 WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);
3219
3220 format_lock_cookie(rbd_dev, cookie);
3221 ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
3222 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3223 CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
3224 RBD_LOCK_TAG, cookie);
3225 if (ret) {
3226 if (ret != -EOPNOTSUPP)
3227 rbd_warn(rbd_dev, "failed to update lock cookie: %d",
3228 ret);
3229
3230 /*
3231 * Lock cookie cannot be updated on older OSDs, so do
3232 * a manual release and queue an acquire.
3233 */
3234 if (rbd_release_lock(rbd_dev))
3235 queue_delayed_work(rbd_dev->task_wq,
3236 &rbd_dev->lock_dwork, 0);
3237 } else {
Florian Margaineedd8ca82017-12-13 16:43:59 +01003238 __rbd_lock(rbd_dev, cookie);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003239 }
3240}
3241
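/*
 * Delayed work that reestablishes the watch after an error (retrying
 * with RBD_RETRY_DELAY as needed), reacquires the exclusive lock if it
 * was held and refreshes the header.
 */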
Ilya Dryomov99d16942016-08-12 16:11:41 +02003242static void rbd_reregister_watch(struct work_struct *work)
3243{
3244 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3245 struct rbd_device, watch_dwork);
3246 int ret;
3247
3248 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3249
3250 mutex_lock(&rbd_dev->watch_mutex);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003251 if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
3252 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003253 return;
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003254 }
Ilya Dryomov99d16942016-08-12 16:11:41 +02003255
3256 ret = __rbd_register_watch(rbd_dev);
3257 if (ret) {
3258 rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
Ilya Dryomov4d736442016-09-29 14:23:12 +02003259 if (ret == -EBLACKLISTED || ret == -ENOENT) {
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003260 set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003261 wake_requests(rbd_dev, true);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003262 } else {
Ilya Dryomov99d16942016-08-12 16:11:41 +02003263 queue_delayed_work(rbd_dev->task_wq,
3264 &rbd_dev->watch_dwork,
3265 RBD_RETRY_DELAY);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003266 }
3267 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003268 return;
Ilya Dryomov99d16942016-08-12 16:11:41 +02003269 }
3270
3271 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
3272 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
3273 mutex_unlock(&rbd_dev->watch_mutex);
3274
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003275 down_write(&rbd_dev->lock_rwsem);
3276 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
3277 rbd_reacquire_lock(rbd_dev);
3278 up_write(&rbd_dev->lock_rwsem);
3279
Ilya Dryomov99d16942016-08-12 16:11:41 +02003280 ret = rbd_dev_refresh(rbd_dev);
3281 if (ret)
3282		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003283}
3284
Alex Elder36be9a72013-01-19 00:30:28 -06003285/*
Alex Elderf40eb342013-04-25 15:09:42 -05003286 * Synchronous osd object method call. Returns the number of bytes
3287 * returned in the inbound buffer, or a negative error code.
Alex Elder36be9a72013-01-19 00:30:28 -06003288 */
3289static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003290 struct ceph_object_id *oid,
3291 struct ceph_object_locator *oloc,
Alex Elder36be9a72013-01-19 00:30:28 -06003292 const char *method_name,
Alex Elder41579762013-04-21 12:14:45 -05003293 const void *outbound,
Alex Elder36be9a72013-01-19 00:30:28 -06003294 size_t outbound_size,
Alex Elder41579762013-04-21 12:14:45 -05003295 void *inbound,
Alex Eldere2a58ee2013-04-30 00:44:33 -05003296 size_t inbound_size)
Alex Elder36be9a72013-01-19 00:30:28 -06003297{
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003298 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3299 struct page *req_page = NULL;
3300 struct page *reply_page;
Alex Elder36be9a72013-01-19 00:30:28 -06003301 int ret;
3302
3303 /*
Alex Elder6010a452013-04-05 01:27:11 -05003304 * Method calls are ultimately read operations. The result
3305 * should be placed into the inbound buffer provided. They
3306 * also supply outbound data--parameters for the object
3307 * method. Currently if this is present it will be a
3308 * snapshot id.
Alex Elder36be9a72013-01-19 00:30:28 -06003309 */
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003310 if (outbound) {
3311 if (outbound_size > PAGE_SIZE)
3312 return -E2BIG;
Alex Elder36be9a72013-01-19 00:30:28 -06003313
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003314 req_page = alloc_page(GFP_KERNEL);
3315 if (!req_page)
3316 return -ENOMEM;
Alex Elder36be9a72013-01-19 00:30:28 -06003317
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003318 memcpy(page_address(req_page), outbound, outbound_size);
Alex Elder04017e22013-04-05 14:46:02 -05003319 }
Alex Elder430c28c2013-04-03 21:32:51 -05003320
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003321 reply_page = alloc_page(GFP_KERNEL);
3322 if (!reply_page) {
3323 if (req_page)
3324 __free_page(req_page);
3325 return -ENOMEM;
3326 }
Alex Elder36be9a72013-01-19 00:30:28 -06003327
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003328 ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
3329 CEPH_OSD_FLAG_READ, req_page, outbound_size,
3330 reply_page, &inbound_size);
3331 if (!ret) {
3332 memcpy(inbound, page_address(reply_page), inbound_size);
3333 ret = inbound_size;
3334 }
Alex Elder57385b52013-04-21 12:14:45 -05003335
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003336 if (req_page)
3337 __free_page(req_page);
3338 __free_page(reply_page);
Alex Elder36be9a72013-01-19 00:30:28 -06003339 return ret;
3340}
3341
Ilya Dryomoved95b212016-08-12 16:40:02 +02003342/*
3343 * lock_rwsem must be held for read.  Sleeps until the lock is acquired or the client is blacklisted.
3344 */
3345static void rbd_wait_state_locked(struct rbd_device *rbd_dev)
3346{
3347 DEFINE_WAIT(wait);
3348
3349 do {
3350 /*
3351 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3352 * and cancel_delayed_work() in wake_requests().
3353 */
3354 dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3355 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3356 prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
3357 TASK_UNINTERRUPTIBLE);
3358 up_read(&rbd_dev->lock_rwsem);
3359 schedule();
3360 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003361 } while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED &&
3362 !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags));
3363
Ilya Dryomoved95b212016-08-12 16:40:02 +02003364 finish_wait(&rbd_dev->lock_waitq, &wait);
3365}
3366
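/*
 * Per-request work function: validate the request, take the exclusive
 * lock if the mapping requires it, then build and submit the image
 * request.
 */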
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003367static void rbd_queue_workfn(struct work_struct *work)
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003368{
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003369 struct request *rq = blk_mq_rq_from_pdu(work);
3370 struct rbd_device *rbd_dev = rq->q->queuedata;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003371 struct rbd_img_request *img_request;
Josh Durgin4e752f02014-04-08 11:12:11 -07003372 struct ceph_snap_context *snapc = NULL;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003373 u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
3374 u64 length = blk_rq_bytes(rq);
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08003375 enum obj_operation_type op_type;
Josh Durgin4e752f02014-04-08 11:12:11 -07003376 u64 mapping_size;
Ilya Dryomov80de1912016-09-20 14:23:17 +02003377 bool must_be_locked;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003378 int result;
3379
Christoph Hellwigaebf5262017-01-31 16:57:31 +01003380 switch (req_op(rq)) {
3381 case REQ_OP_DISCARD:
Ilya Dryomov6ac56952017-05-22 19:59:24 +02003382 case REQ_OP_WRITE_ZEROES:
Christoph Hellwigaebf5262017-01-31 16:57:31 +01003383 op_type = OBJ_OP_DISCARD;
3384 break;
3385 case REQ_OP_WRITE:
3386 op_type = OBJ_OP_WRITE;
3387 break;
3388 case REQ_OP_READ:
3389 op_type = OBJ_OP_READ;
3390 break;
3391 default:
3392 dout("%s: non-fs request type %d\n", __func__, req_op(rq));
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003393 result = -EIO;
3394 goto err;
3395 }
3396
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003397 /* Ignore/skip any zero-length requests */
3398
3399 if (!length) {
3400 dout("%s: zero-length request\n", __func__);
3401 result = 0;
3402 goto err_rq;
3403 }
3404
Ilya Dryomov9568c932017-10-12 12:35:19 +02003405 rbd_assert(op_type == OBJ_OP_READ ||
3406 rbd_dev->spec->snap_id == CEPH_NOSNAP);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003407
3408 /*
3409 * Quit early if the mapped snapshot no longer exists. It's
3410 * still possible the snapshot will have disappeared by the
3411 * time our request arrives at the osd, but there's no sense in
3412 * sending it if we already know.
3413 */
3414 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
3415		dout("request for non-existent snapshot\n");
3416 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
3417 result = -ENXIO;
3418 goto err_rq;
3419 }
3420
3421 if (offset && length > U64_MAX - offset + 1) {
3422 rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
3423 length);
3424 result = -EINVAL;
3425 goto err_rq; /* Shouldn't happen */
3426 }
3427
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003428 blk_mq_start_request(rq);
3429
Josh Durgin4e752f02014-04-08 11:12:11 -07003430 down_read(&rbd_dev->header_rwsem);
3431 mapping_size = rbd_dev->mapping.size;
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08003432 if (op_type != OBJ_OP_READ) {
Josh Durgin4e752f02014-04-08 11:12:11 -07003433 snapc = rbd_dev->header.snapc;
3434 ceph_get_snap_context(snapc);
3435 }
3436 up_read(&rbd_dev->header_rwsem);
3437
3438 if (offset + length > mapping_size) {
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003439 rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
Josh Durgin4e752f02014-04-08 11:12:11 -07003440 length, mapping_size);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003441 result = -EIO;
3442 goto err_rq;
3443 }
3444
Ilya Dryomovf9bebd52017-04-13 12:17:39 +02003445 must_be_locked =
3446 (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
3447 (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003448 if (must_be_locked) {
3449 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003450 if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED &&
Ilya Dryomove010dd02017-04-13 12:17:39 +02003451 !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
3452 if (rbd_dev->opts->exclusive) {
3453 rbd_warn(rbd_dev, "exclusive lock required");
3454 result = -EROFS;
3455 goto err_unlock;
3456 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02003457 rbd_wait_state_locked(rbd_dev);
Ilya Dryomove010dd02017-04-13 12:17:39 +02003458 }
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003459 if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
3460 result = -EBLACKLISTED;
3461 goto err_unlock;
3462 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02003463 }
3464
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08003465 img_request = rbd_img_request_create(rbd_dev, offset, length, op_type,
Josh Durgin4e752f02014-04-08 11:12:11 -07003466 snapc);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003467 if (!img_request) {
3468 result = -ENOMEM;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003469 goto err_unlock;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003470 }
3471 img_request->rq = rq;
Ilya Dryomov70b16db2015-11-27 19:23:24 +01003472 snapc = NULL; /* img_request consumes a ref */
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003473
Guangliang Zhao90e98c52014-04-01 22:22:16 +08003474 if (op_type == OBJ_OP_DISCARD)
3475 result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA,
3476 NULL);
Ilya Dryomov5359a172018-01-20 10:30:10 +01003477 else {
3478 struct ceph_bio_iter bio_it = { .bio = rq->bio,
3479 .iter = rq->bio->bi_iter };
3480
Guangliang Zhao90e98c52014-04-01 22:22:16 +08003481 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
Ilya Dryomov5359a172018-01-20 10:30:10 +01003482 &bio_it);
3483 }
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003484 if (result)
3485 goto err_img_request;
3486
Ilya Dryomovefbd1a12018-01-30 17:52:11 +01003487 rbd_img_request_submit(img_request);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003488 if (must_be_locked)
3489 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003490 return;
3491
3492err_img_request:
3493 rbd_img_request_put(img_request);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003494err_unlock:
3495 if (must_be_locked)
3496 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003497err_rq:
3498 if (result)
3499 rbd_warn(rbd_dev, "%s %llx at %llx result %d",
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08003500 obj_op_name(op_type), length, offset, result);
SF Markus Elfringe96a6502014-11-02 15:20:59 +01003501 ceph_put_snap_context(snapc);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003502err:
Christoph Hellwig2a842ac2017-06-03 09:38:04 +02003503 blk_mq_end_request(rq, errno_to_blk_status(result));
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003504}
3505
Christoph Hellwigfc17b652017-06-03 09:38:05 +02003506static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003507 const struct blk_mq_queue_data *bd)
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003508{
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003509 struct request *rq = bd->rq;
3510 struct work_struct *work = blk_mq_rq_to_pdu(rq);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003511
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003512 queue_work(rbd_wq, work);
Christoph Hellwigfc17b652017-06-03 09:38:05 +02003513 return BLK_STS_OK;
Alex Elderbf0d5f502012-11-22 00:00:08 -06003514}
3515
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003516static void rbd_free_disk(struct rbd_device *rbd_dev)
3517{
Ilya Dryomov5769ed02017-04-13 12:17:38 +02003518 blk_cleanup_queue(rbd_dev->disk->queue);
3519 blk_mq_free_tag_set(&rbd_dev->tag_set);
3520 put_disk(rbd_dev->disk);
Alex Eldera0cab922013-04-25 23:15:08 -05003521 rbd_dev->disk = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003522}
3523
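/*
 * Synchronously read up to buf_len bytes from the given object into
 * buf.  Returns the number of bytes read on success or a negative
 * error code.
 */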
Alex Elder788e2df2013-01-17 12:25:27 -06003524static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003525 struct ceph_object_id *oid,
3526 struct ceph_object_locator *oloc,
3527 void *buf, int buf_len)
Alex Elder788e2df2013-01-17 12:25:27 -06003528
3529{
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003530 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3531 struct ceph_osd_request *req;
3532 struct page **pages;
3533 int num_pages = calc_pages_for(0, buf_len);
Alex Elder788e2df2013-01-17 12:25:27 -06003534 int ret;
3535
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003536 req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
3537 if (!req)
3538 return -ENOMEM;
Alex Elder788e2df2013-01-17 12:25:27 -06003539
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003540 ceph_oid_copy(&req->r_base_oid, oid);
3541 ceph_oloc_copy(&req->r_base_oloc, oloc);
3542 req->r_flags = CEPH_OSD_FLAG_READ;
Alex Elder788e2df2013-01-17 12:25:27 -06003543
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003544 ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
Alex Elder788e2df2013-01-17 12:25:27 -06003545 if (ret)
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003546 goto out_req;
Alex Elder788e2df2013-01-17 12:25:27 -06003547
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003548 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
3549 if (IS_ERR(pages)) {
3550 ret = PTR_ERR(pages);
3551 goto out_req;
3552 }
Alex Elder1ceae7e2013-02-06 13:11:38 -06003553
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003554 osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
3555 osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
3556 true);
Alex Elder788e2df2013-01-17 12:25:27 -06003557
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003558 ceph_osdc_start_request(osdc, req, false);
3559 ret = ceph_osdc_wait_request(osdc, req);
3560 if (ret >= 0)
3561 ceph_copy_from_page_vector(pages, buf, 0, ret);
3562
3563out_req:
3564 ceph_osdc_put_request(req);
Alex Elder788e2df2013-01-17 12:25:27 -06003565 return ret;
3566}
3567
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003568/*
Alex Elder662518b2013-05-06 09:51:29 -05003569 * Read the complete header for the given rbd device. On successful
3570 * return, the rbd_dev->header field will contain up-to-date
3571 * information about the image.
Alex Elder4156d992012-08-02 11:29:46 -05003572 */
Alex Elder99a41eb2013-05-06 09:51:30 -05003573static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
Alex Elder4156d992012-08-02 11:29:46 -05003574{
3575 struct rbd_image_header_ondisk *ondisk = NULL;
3576 u32 snap_count = 0;
3577 u64 names_size = 0;
3578 u32 want_count;
3579 int ret;
3580
3581 /*
3582 * The complete header will include an array of its 64-bit
3583 * snapshot ids, followed by the names of those snapshots as
3584 * a contiguous block of NUL-terminated strings. Note that
3585 * the number of snapshots could change by the time we read
3586 * it in, in which case we re-read it.
3587 */
3588 do {
3589 size_t size;
3590
3591 kfree(ondisk);
3592
3593 size = sizeof (*ondisk);
3594 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
3595 size += names_size;
3596 ondisk = kmalloc(size, GFP_KERNEL);
3597 if (!ondisk)
Alex Elder662518b2013-05-06 09:51:29 -05003598 return -ENOMEM;
Alex Elder4156d992012-08-02 11:29:46 -05003599
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003600 ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
3601 &rbd_dev->header_oloc, ondisk, size);
Alex Elder4156d992012-08-02 11:29:46 -05003602 if (ret < 0)
Alex Elder662518b2013-05-06 09:51:29 -05003603 goto out;
Alex Elderc0cd10db2013-04-26 09:43:47 -05003604 if ((size_t)ret < size) {
Alex Elder4156d992012-08-02 11:29:46 -05003605 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05003606 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
3607 size, ret);
Alex Elder662518b2013-05-06 09:51:29 -05003608 goto out;
Alex Elder4156d992012-08-02 11:29:46 -05003609 }
3610 if (!rbd_dev_ondisk_valid(ondisk)) {
3611 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05003612 rbd_warn(rbd_dev, "invalid header");
Alex Elder662518b2013-05-06 09:51:29 -05003613 goto out;
Alex Elder4156d992012-08-02 11:29:46 -05003614 }
3615
3616 names_size = le64_to_cpu(ondisk->snap_names_len);
3617 want_count = snap_count;
3618 snap_count = le32_to_cpu(ondisk->snap_count);
3619 } while (snap_count != want_count);
3620
Alex Elder662518b2013-05-06 09:51:29 -05003621 ret = rbd_header_from_disk(rbd_dev, ondisk);
3622out:
Alex Elder4156d992012-08-02 11:29:46 -05003623 kfree(ondisk);
3624
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003625 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003626}
3627
Alex Elder15228ed2013-05-01 12:43:03 -05003628/*
3629 * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
3630 * has disappeared from the (just updated) snapshot context.
3631 */
3632static void rbd_exists_validate(struct rbd_device *rbd_dev)
3633{
3634 u64 snap_id;
3635
3636 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
3637 return;
3638
3639 snap_id = rbd_dev->spec->snap_id;
3640 if (snap_id == CEPH_NOSNAP)
3641 return;
3642
3643 if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
3644 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
3645}
3646
Josh Durgin98752012013-08-29 17:26:31 -07003647static void rbd_dev_update_size(struct rbd_device *rbd_dev)
3648{
3649 sector_t size;
Josh Durgin98752012013-08-29 17:26:31 -07003650
3651 /*
Ilya Dryomov811c6682016-04-15 16:22:16 +02003652 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
3653 * try to update its size. If REMOVING is set, updating size
3654 * is just useless work since the device can't be opened.
Josh Durgin98752012013-08-29 17:26:31 -07003655 */
Ilya Dryomov811c6682016-04-15 16:22:16 +02003656 if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
3657 !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
Josh Durgin98752012013-08-29 17:26:31 -07003658 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
3659 dout("setting size to %llu sectors", (unsigned long long)size);
3660 set_capacity(rbd_dev->disk, size);
3661 revalidate_disk(rbd_dev->disk);
3662 }
3663}
3664
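/*
 * Re-read the image header and propagate any size change to the block
 * device; for a mapped snapshot, revalidate its EXISTS flag instead.
 */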
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003665static int rbd_dev_refresh(struct rbd_device *rbd_dev)
Alex Elder1fe5e992012-07-25 09:32:41 -05003666{
Alex Eldere627db02013-05-06 07:40:30 -05003667 u64 mapping_size;
Alex Elder1fe5e992012-07-25 09:32:41 -05003668 int ret;
3669
Alex Eldercfbf6372013-05-31 17:40:45 -05003670 down_write(&rbd_dev->header_rwsem);
Alex Elder3b5cf2a2013-05-29 11:18:59 -05003671 mapping_size = rbd_dev->mapping.size;
Ilya Dryomova720ae02014-07-23 17:11:19 +04003672
3673 ret = rbd_dev_header_info(rbd_dev);
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04003674 if (ret)
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003675 goto out;
Alex Elder15228ed2013-05-01 12:43:03 -05003676
Ilya Dryomove8f59b52014-07-24 10:42:13 +04003677 /*
3678 * If there is a parent, see if it has disappeared due to the
3679 * mapped image getting flattened.
3680 */
3681 if (rbd_dev->parent) {
3682 ret = rbd_dev_v2_parent_info(rbd_dev);
3683 if (ret)
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003684 goto out;
Ilya Dryomove8f59b52014-07-24 10:42:13 +04003685 }
3686
Ilya Dryomov5ff11082014-07-23 17:11:21 +04003687 if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003688 rbd_dev->mapping.size = rbd_dev->header.image_size;
Ilya Dryomov5ff11082014-07-23 17:11:21 +04003689 } else {
3690 /* validate mapped snapshot's EXISTS flag */
3691 rbd_exists_validate(rbd_dev);
3692 }
Alex Elder15228ed2013-05-01 12:43:03 -05003693
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003694out:
Alex Eldercfbf6372013-05-31 17:40:45 -05003695 up_write(&rbd_dev->header_rwsem);
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003696 if (!ret && mapping_size != rbd_dev->mapping.size)
Josh Durgin98752012013-08-29 17:26:31 -07003697 rbd_dev_update_size(rbd_dev);
Alex Elder1fe5e992012-07-25 09:32:41 -05003698
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003699 return ret;
Alex Elder1fe5e992012-07-25 09:32:41 -05003700}
3701
Christoph Hellwigd6296d32017-05-01 10:19:08 -06003702static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
3703 unsigned int hctx_idx, unsigned int numa_node)
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003704{
3705 struct work_struct *work = blk_mq_rq_to_pdu(rq);
3706
3707 INIT_WORK(work, rbd_queue_workfn);
3708 return 0;
3709}
3710
Eric Biggersf363b082017-03-30 13:39:16 -07003711static const struct blk_mq_ops rbd_mq_ops = {
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003712 .queue_rq = rbd_queue_rq,
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003713 .init_request = rbd_init_request,
3714};
3715
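/*
 * Allocate the gendisk, set up the blk-mq tag set and request queue,
 * and apply queue limits derived from the object size.
 */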
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003716static int rbd_init_disk(struct rbd_device *rbd_dev)
3717{
3718 struct gendisk *disk;
3719 struct request_queue *q;
Alex Elder593a9e72012-02-07 12:03:37 -06003720 u64 segment_size;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003721 int err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003722
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003723 /* create gendisk info */
Ilya Dryomov7e513d42013-12-16 19:26:32 +02003724 disk = alloc_disk(single_major ?
3725 (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
3726 RBD_MINORS_PER_MAJOR);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003727 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05003728 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003729
Alex Elderf0f8cef2012-01-29 13:57:44 -06003730 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05003731 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003732 disk->major = rbd_dev->major;
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02003733 disk->first_minor = rbd_dev->minor;
Ilya Dryomov7e513d42013-12-16 19:26:32 +02003734 if (single_major)
3735 disk->flags |= GENHD_FL_EXT_DEVT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003736 disk->fops = &rbd_bd_ops;
3737 disk->private_data = rbd_dev;
3738
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003739 memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
3740 rbd_dev->tag_set.ops = &rbd_mq_ops;
Ilya Dryomovb5584182015-06-23 16:21:19 +03003741 rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003742 rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
Ilya Dryomovb5584182015-06-23 16:21:19 +03003743 rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003744 rbd_dev->tag_set.nr_hw_queues = 1;
3745 rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
3746
3747 err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
3748 if (err)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003749 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07003750
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003751 q = blk_mq_init_queue(&rbd_dev->tag_set);
3752 if (IS_ERR(q)) {
3753 err = PTR_ERR(q);
3754 goto out_tag_set;
3755 }
3756
Ilya Dryomovd8a2c892015-03-24 16:15:17 +03003757 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
3758 /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
Alex Elder593a9e72012-02-07 12:03:37 -06003759
Josh Durgin029bcbd2011-07-22 11:35:23 -07003760 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06003761 segment_size = rbd_obj_bytes(&rbd_dev->header);
3762 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
Ilya Dryomov0d9fde42015-10-07 16:09:35 +02003763 q->limits.max_sectors = queue_max_hw_sectors(q);
Ilya Dryomov21acdf42017-12-21 15:35:11 +01003764 blk_queue_max_segments(q, USHRT_MAX);
Ilya Dryomov24f1df62018-01-12 17:22:10 +01003765 blk_queue_max_segment_size(q, UINT_MAX);
Alex Elder593a9e72012-02-07 12:03:37 -06003766 blk_queue_io_min(q, segment_size);
3767 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07003768
Guangliang Zhao90e98c52014-04-01 22:22:16 +08003769 /* enable the discard support */
3770 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
3771 q->limits.discard_granularity = segment_size;
Jens Axboe2bb4cd52015-07-14 08:15:12 -06003772 blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE);
Ilya Dryomov6ac56952017-05-22 19:59:24 +02003773 blk_queue_max_write_zeroes_sectors(q, segment_size / SECTOR_SIZE);
Guangliang Zhao90e98c52014-04-01 22:22:16 +08003774
Ronny Hegewaldbae818e2015-10-15 18:50:46 +00003775 if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
Jan Karadc3b17c2017-02-02 15:56:50 +01003776 q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
Ronny Hegewaldbae818e2015-10-15 18:50:46 +00003777
Ilya Dryomov5769ed02017-04-13 12:17:38 +02003778 /*
3779 * disk_release() expects a queue ref from add_disk() and will
3780 * put it. Hold an extra ref until add_disk() is called.
3781 */
3782 WARN_ON(!blk_get_queue(q));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003783 disk->queue = q;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003784 q->queuedata = rbd_dev;
3785
3786 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003787
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003788 return 0;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003789out_tag_set:
3790 blk_mq_free_tag_set(&rbd_dev->tag_set);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003791out_disk:
3792 put_disk(disk);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003793 return err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003794}
3795
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003796/*
3797 sysfs
3798*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003799
Alex Elder593a9e72012-02-07 12:03:37 -06003800static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3801{
3802 return container_of(dev, struct rbd_device, dev);
3803}
3804
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003805static ssize_t rbd_size_show(struct device *dev,
3806 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003807{
Alex Elder593a9e72012-02-07 12:03:37 -06003808 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003809
Alex Elderfc71d832013-04-26 15:44:36 -05003810 return sprintf(buf, "%llu\n",
3811 (unsigned long long)rbd_dev->mapping.size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003812}
3813
Alex Elder34b13182012-07-13 20:35:12 -05003814/*
3815 * Note this shows the features for whatever's mapped, which is not
3816 * necessarily the base image.
3817 */
3818static ssize_t rbd_features_show(struct device *dev,
3819 struct device_attribute *attr, char *buf)
3820{
3821 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3822
3823 return sprintf(buf, "0x%016llx\n",
Alex Elderfc71d832013-04-26 15:44:36 -05003824 (unsigned long long)rbd_dev->mapping.features);
Alex Elder34b13182012-07-13 20:35:12 -05003825}
3826
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003827static ssize_t rbd_major_show(struct device *dev,
3828 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003829{
Alex Elder593a9e72012-02-07 12:03:37 -06003830 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003831
Alex Elderfc71d832013-04-26 15:44:36 -05003832 if (rbd_dev->major)
3833 return sprintf(buf, "%d\n", rbd_dev->major);
3834
3835 return sprintf(buf, "(none)\n");
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02003836}
Alex Elderfc71d832013-04-26 15:44:36 -05003837
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02003838static ssize_t rbd_minor_show(struct device *dev,
3839 struct device_attribute *attr, char *buf)
3840{
3841 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3842
3843 return sprintf(buf, "%d\n", rbd_dev->minor);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003844}
3845
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02003846static ssize_t rbd_client_addr_show(struct device *dev,
3847 struct device_attribute *attr, char *buf)
3848{
3849 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3850 struct ceph_entity_addr *client_addr =
3851 ceph_client_addr(rbd_dev->rbd_client->client);
3852
3853 return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
3854 le32_to_cpu(client_addr->nonce));
3855}
3856
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003857static ssize_t rbd_client_id_show(struct device *dev,
3858 struct device_attribute *attr, char *buf)
3859{
Alex Elder593a9e72012-02-07 12:03:37 -06003860 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003861
Alex Elder1dbb4392012-01-24 10:08:37 -06003862 return sprintf(buf, "client%lld\n",
Ilya Dryomov033268a2016-08-12 14:59:58 +02003863 ceph_client_gid(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003864}
3865
Mike Christie267fb902016-08-18 18:38:43 +02003866static ssize_t rbd_cluster_fsid_show(struct device *dev,
3867 struct device_attribute *attr, char *buf)
3868{
3869 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3870
3871 return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
3872}
3873
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02003874static ssize_t rbd_config_info_show(struct device *dev,
3875 struct device_attribute *attr, char *buf)
3876{
3877 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3878
3879 return sprintf(buf, "%s\n", rbd_dev->config_info);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003880}
3881
3882static ssize_t rbd_pool_show(struct device *dev,
3883 struct device_attribute *attr, char *buf)
3884{
Alex Elder593a9e72012-02-07 12:03:37 -06003885 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003886
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003887 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003888}
3889
Alex Elder9bb2f332012-07-12 10:46:35 -05003890static ssize_t rbd_pool_id_show(struct device *dev,
3891 struct device_attribute *attr, char *buf)
3892{
3893 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3894
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003895 return sprintf(buf, "%llu\n",
Alex Elderfc71d832013-04-26 15:44:36 -05003896 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05003897}
3898
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003899static ssize_t rbd_name_show(struct device *dev,
3900 struct device_attribute *attr, char *buf)
3901{
Alex Elder593a9e72012-02-07 12:03:37 -06003902 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003903
Alex Eldera92ffdf2012-10-30 19:40:33 -05003904 if (rbd_dev->spec->image_name)
3905 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3906
3907 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003908}
3909
Alex Elder589d30e2012-07-10 20:30:11 -05003910static ssize_t rbd_image_id_show(struct device *dev,
3911 struct device_attribute *attr, char *buf)
3912{
3913 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3914
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003915 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05003916}
3917
Alex Elder34b13182012-07-13 20:35:12 -05003918/*
3919 * Shows the name of the currently-mapped snapshot (or
3920 * RBD_SNAP_HEAD_NAME for the base image).
3921 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003922static ssize_t rbd_snap_show(struct device *dev,
3923 struct device_attribute *attr,
3924 char *buf)
3925{
Alex Elder593a9e72012-02-07 12:03:37 -06003926 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003927
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003928 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003929}
3930
Mike Christie92a58672016-08-18 18:38:44 +02003931static ssize_t rbd_snap_id_show(struct device *dev,
3932 struct device_attribute *attr, char *buf)
3933{
3934 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3935
3936 return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
3937}
3938
Alex Elder86b00e02012-10-25 23:34:42 -05003939/*
Ilya Dryomovff961282014-07-22 21:53:07 +04003940 * For a v2 image, shows the chain of parent images, separated by empty
3941 * lines. For v1 images or if there is no parent, shows "(no parent
3942 * image)".
Alex Elder86b00e02012-10-25 23:34:42 -05003943 */
3944static ssize_t rbd_parent_show(struct device *dev,
Ilya Dryomovff961282014-07-22 21:53:07 +04003945 struct device_attribute *attr,
3946 char *buf)
Alex Elder86b00e02012-10-25 23:34:42 -05003947{
3948 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Ilya Dryomovff961282014-07-22 21:53:07 +04003949 ssize_t count = 0;
Alex Elder86b00e02012-10-25 23:34:42 -05003950
Ilya Dryomovff961282014-07-22 21:53:07 +04003951 if (!rbd_dev->parent)
Alex Elder86b00e02012-10-25 23:34:42 -05003952 return sprintf(buf, "(no parent image)\n");
3953
Ilya Dryomovff961282014-07-22 21:53:07 +04003954 for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
3955 struct rbd_spec *spec = rbd_dev->parent_spec;
Alex Elder86b00e02012-10-25 23:34:42 -05003956
Ilya Dryomovff961282014-07-22 21:53:07 +04003957 count += sprintf(&buf[count], "%s"
3958 "pool_id %llu\npool_name %s\n"
3959 "image_id %s\nimage_name %s\n"
3960 "snap_id %llu\nsnap_name %s\n"
3961 "overlap %llu\n",
3962 !count ? "" : "\n", /* first? */
3963 spec->pool_id, spec->pool_name,
3964 spec->image_id, spec->image_name ?: "(unknown)",
3965 spec->snap_id, spec->snap_name,
3966 rbd_dev->parent_overlap);
3967 }
Alex Elder86b00e02012-10-25 23:34:42 -05003968
Ilya Dryomovff961282014-07-22 21:53:07 +04003969 return count;
Alex Elder86b00e02012-10-25 23:34:42 -05003970}
3971
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003972static ssize_t rbd_image_refresh(struct device *dev,
3973 struct device_attribute *attr,
3974 const char *buf,
3975 size_t size)
3976{
Alex Elder593a9e72012-02-07 12:03:37 -06003977 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05003978 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003979
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003980 ret = rbd_dev_refresh(rbd_dev);
Alex Eldere627db02013-05-06 07:40:30 -05003981 if (ret)
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04003982 return ret;
Alex Elderb8136232012-07-25 09:32:41 -05003983
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04003984 return size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003985}

static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
static DEVICE_ATTR(client_addr, S_IRUGO, rbd_client_addr_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(cluster_fsid, S_IRUGO, rbd_cluster_fsid_show, NULL);
static DEVICE_ATTR(config_info, S_IRUSR, rbd_config_info_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_minor.attr,
	&dev_attr_client_addr.attr,
	&dev_attr_client_id.attr,
	&dev_attr_cluster_fsid.attr,
	&dev_attr_config_info.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

static void rbd_dev_release(struct device *dev);

static const struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_dev_release,
};

static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
{
	kref_get(&spec->kref);

	return spec;
}

static void rbd_spec_free(struct kref *kref);
static void rbd_spec_put(struct rbd_spec *spec)
{
	if (spec)
		kref_put(&spec->kref, rbd_spec_free);
}

static struct rbd_spec *rbd_spec_alloc(void)
{
	struct rbd_spec *spec;

	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
	if (!spec)
		return NULL;

	spec->pool_id = CEPH_NOPOOL;
	spec->snap_id = CEPH_NOSNAP;
	kref_init(&spec->kref);

	return spec;
}

static void rbd_spec_free(struct kref *kref)
{
	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);

	kfree(spec->pool_name);
	kfree(spec->image_id);
	kfree(spec->image_name);
	kfree(spec->snap_name);
	kfree(spec);
}
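
/*
 * An rbd_spec is reference counted: rbd_spec_alloc() returns it with
 * one reference, rbd_spec_get() takes another (e.g. when a clone
 * shares its parent's spec), and rbd_spec_put() drops one, freeing the
 * spec and all of its dynamically allocated names when the count hits
 * zero.
 */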

static void rbd_dev_free(struct rbd_device *rbd_dev)
{
	WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);

	ceph_oid_destroy(&rbd_dev->header_oid);
	ceph_oloc_destroy(&rbd_dev->header_oloc);
	kfree(rbd_dev->config_info);

	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev->opts);
	kfree(rbd_dev);
}

static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	bool need_put = !!rbd_dev->opts;

	if (need_put) {
		destroy_workqueue(rbd_dev->task_wq);
		ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
	}

	rbd_dev_free(rbd_dev);

	/*
	 * This is racy, but way better than doing the module_put()
	 * outside of the release callback.  The race window is pretty
	 * small, so doing something similar to dm (dm-builtin.c) is
	 * overkill.
	 */
	if (need_put)
		module_put(THIS_MODULE);
}

static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
					   struct rbd_spec *spec)
{
	struct rbd_device *rbd_dev;

	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		return NULL;

	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	init_rwsem(&rbd_dev->header_rwsem);

	rbd_dev->header.data_pool_id = CEPH_NOPOOL;
	ceph_oid_init(&rbd_dev->header_oid);
	rbd_dev->header_oloc.pool = spec->pool_id;

	mutex_init(&rbd_dev->watch_mutex);
	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
	INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);

	init_rwsem(&rbd_dev->lock_rwsem);
	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
	INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
	INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
	INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
	INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
	init_waitqueue_head(&rbd_dev->lock_waitq);

	rbd_dev->dev.bus = &rbd_bus_type;
	rbd_dev->dev.type = &rbd_device_type;
	rbd_dev->dev.parent = &rbd_root_dev;
	device_initialize(&rbd_dev->dev);

	rbd_dev->rbd_client = rbdc;
	rbd_dev->spec = spec;

	return rbd_dev;
}
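
/*
 * Device creation is split in two: __rbd_dev_create() above only
 * fills in the in-memory structure (locks, work items, sysfs device
 * type), while rbd_dev_create() below also allocates the externally
 * visible pieces (device id, name, task workqueue) that a mapping
 * needs.  Parent images in a clone chain use only the former.
 */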

/*
 * Create a mapping rbd_dev.
 */
static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
					 struct rbd_spec *spec,
					 struct rbd_options *opts)
{
	struct rbd_device *rbd_dev;

	rbd_dev = __rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		return NULL;

	rbd_dev->opts = opts;

	/* get an id and fill in device name */
	rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
					 minor_to_rbd_dev_id(1 << MINORBITS),
					 GFP_KERNEL);
	if (rbd_dev->dev_id < 0)
		goto fail_rbd_dev;

	sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
	rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
						   rbd_dev->name);
	if (!rbd_dev->task_wq)
		goto fail_dev_id;

	/* we have a ref from do_rbd_add() */
	__module_get(THIS_MODULE);

	dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
	return rbd_dev;

fail_dev_id:
	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
fail_rbd_dev:
	rbd_dev_free(rbd_dev);
	return NULL;
}

static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	if (rbd_dev)
		put_device(&rbd_dev->dev);
}

/*
 * Get the size and object order for an image snapshot, or, if
 * snap_id is CEPH_NOSNAP, get this information for the base
 * image.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				 u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_size",
				  &snapid, sizeof(snapid),
				  &size_buf, sizeof(size_buf));
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < sizeof (size_buf))
		return -ERANGE;

	if (order) {
		*order = size_buf.order;
		dout(" order %u", (unsigned int)*order);
	}
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx snap_size = %llu\n",
	     (unsigned long long)snap_id,
	     (unsigned long long)*snap_size);

	return 0;
}
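
/*
 * The rbd_dev_v2_*() helpers that follow all use the same pattern:
 * call a method of the "rbd" object class on the image header object
 * via rbd_obj_method_sync() and decode the little-endian reply.  A
 * short reply (ret smaller than the expected size) is treated as
 * -ERANGE rather than trusting partial data.
 */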

static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
				     &rbd_dev->header.obj_order,
				     &rbd_dev->header.image_size);
}

static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_object_prefix",
				  NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + ret, NULL, GFP_NOIO);
	ret = 0;

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}
out:
	kfree(reply_buf);

	return ret;
}

static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				     u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	struct {
		__le64 features;
		__le64 incompat;
	} __attribute__ ((packed)) features_buf = { 0 };
	u64 unsup;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_features",
				  &snapid, sizeof(snapid),
				  &features_buf, sizeof(features_buf));
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < sizeof (features_buf))
		return -ERANGE;

	unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
	if (unsup) {
		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
			 unsup);
		return -ENXIO;
	}

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
	     (unsigned long long)snap_id,
	     (unsigned long long)*snap_features,
	     (unsigned long long)le64_to_cpu(features_buf.incompat));

	return 0;
}

static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
					 &rbd_dev->header.features);
}
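
/*
 * Note the asymmetry above: "features" is recorded for the caller,
 * while "incompat" is only checked.  An image is refused (-ENXIO) if
 * it has any incompatible feature outside RBD_FEATURES_SUPPORTED,
 * since the driver could not safely interpret its on-disk data.
 */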

static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	size_t size;
	void *reply_buf = NULL;
	__le64 snapid;
	void *p;
	void *end;
	u64 pool_id;
	char *image_id;
	u64 snap_id;
	u64 overlap;
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	size = sizeof (__le64) +				/* pool_id */
		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
		sizeof (__le64) +				/* snap_id */
		sizeof (__le64);				/* overlap */
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf) {
		ret = -ENOMEM;
		goto out_err;
	}

	snapid = cpu_to_le64(rbd_dev->spec->snap_id);
	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_parent",
				  &snapid, sizeof(snapid), reply_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out_err;

	p = reply_buf;
	end = reply_buf + ret;
	ret = -ERANGE;
	ceph_decode_64_safe(&p, end, pool_id, out_err);
	if (pool_id == CEPH_NOPOOL) {
		/*
		 * Either the parent never existed, or we have
		 * record of it but the image got flattened so it no
		 * longer has a parent.  When the parent of a
		 * layered image disappears we immediately set the
		 * overlap to 0.  The effect of this is that all new
		 * requests will be treated as if the image had no
		 * parent.
		 */
		if (rbd_dev->parent_overlap) {
			rbd_dev->parent_overlap = 0;
			rbd_dev_parent_put(rbd_dev);
			pr_info("%s: clone image has been flattened\n",
				rbd_dev->disk->disk_name);
		}

		goto out;	/* No parent?  No problem. */
	}

	/* The ceph file layout needs to fit pool id in 32 bits */

	ret = -EIO;
	if (pool_id > (u64)U32_MAX) {
		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
			 (unsigned long long)pool_id, U32_MAX);
		goto out_err;
	}

	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(image_id)) {
		ret = PTR_ERR(image_id);
		goto out_err;
	}
	ceph_decode_64_safe(&p, end, snap_id, out_err);
	ceph_decode_64_safe(&p, end, overlap, out_err);

	/*
	 * The parent won't change (except when the clone is
	 * flattened, which is handled above).  So we only need to
	 * record the parent spec if we have not already done so.
	 */
	if (!rbd_dev->parent_spec) {
		parent_spec->pool_id = pool_id;
		parent_spec->image_id = image_id;
		parent_spec->snap_id = snap_id;
		rbd_dev->parent_spec = parent_spec;
		parent_spec = NULL;	/* rbd_dev now owns this */
	} else {
		kfree(image_id);
	}

	/*
	 * We always update the parent overlap.  If it's zero we issue
	 * a warning, as we will proceed as if there was no parent.
	 */
	if (!overlap) {
		if (parent_spec) {
			/* refresh, careful to warn just once */
			if (rbd_dev->parent_overlap)
				rbd_warn(rbd_dev,
				    "clone now standalone (overlap became 0)");
		} else {
			/* initial probe */
			rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
		}
	}
	rbd_dev->parent_overlap = overlap;

out:
	ret = 0;
out_err:
	kfree(reply_buf);
	rbd_spec_put(parent_spec);

	return ret;
}
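
/*
 * For reference, the "get_parent" reply decoded above consists of
 * four consecutive fields:
 *
 *	__le64	pool_id;	(CEPH_NOPOOL if there is no parent)
 *	string	image_id;	(__le32 length followed by the bytes)
 *	__le64	snap_id;
 *	__le64	overlap;	(bytes shared with the parent)
 */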

static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
{
	struct {
		__le64 stripe_unit;
		__le64 stripe_count;
	} __attribute__ ((packed)) striping_info_buf = { 0 };
	size_t size = sizeof (striping_info_buf);
	void *p;
	u64 obj_size;
	u64 stripe_unit;
	u64 stripe_count;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_stripe_unit_count",
				  NULL, 0, &striping_info_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < size)
		return -ERANGE;

	/*
	 * We don't actually support the "fancy striping" feature
	 * (STRIPINGV2) yet, but if the striping sizes are the
	 * defaults the behavior is the same as before.  So find
	 * out, and only fail if the image has non-default values.
	 */
	ret = -EINVAL;
	obj_size = rbd_obj_bytes(&rbd_dev->header);
	p = &striping_info_buf;
	stripe_unit = ceph_decode_64(&p);
	if (stripe_unit != obj_size) {
		rbd_warn(rbd_dev, "unsupported stripe unit "
			 "(got %llu want %llu)",
			 stripe_unit, obj_size);
		return -EINVAL;
	}
	stripe_count = ceph_decode_64(&p);
	if (stripe_count != 1) {
		rbd_warn(rbd_dev, "unsupported stripe count "
			 "(got %llu want 1)", stripe_count);
		return -EINVAL;
	}
	rbd_dev->header.stripe_unit = stripe_unit;
	rbd_dev->header.stripe_count = stripe_count;

	return 0;
}

static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
{
	__le64 data_pool_id;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_data_pool",
				  NULL, 0, &data_pool_id, sizeof(data_pool_id));
	if (ret < 0)
		return ret;
	if (ret < sizeof(data_pool_id))
		return -EBADMSG;

	rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
	WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
	return 0;
}

static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	CEPH_DEFINE_OID_ONSTACK(oid);
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	rbd_assert(!rbd_dev->spec->image_name);

	len = strlen(rbd_dev->spec->image_id);
	image_id_size = sizeof (__le32) + len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
				  "dir_get_name", image_id, image_id_size,
				  reply_buf, size);
	if (ret < 0)
		goto out;
	p = reply_buf;
	end = reply_buf + ret;

	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}
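
/*
 * The id-to-name lookup above goes through the pool's RBD_DIRECTORY
 * object, whose "dir_get_name" method returns the current name for an
 * image id.  Failure is tolerated by the caller, since a mapping can
 * operate on the image id alone.
 */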

static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const char *snap_name;
	u32 which = 0;

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which < snapc->num_snaps) {
		if (!strcmp(name, snap_name))
			return snapc->snaps[which];
		snap_name += strlen(snap_name) + 1;
		which++;
	}
	return CEPH_NOSNAP;
}

static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	u32 which;
	bool found = false;
	u64 snap_id;

	for (which = 0; !found && which < snapc->num_snaps; which++) {
		const char *snap_name;

		snap_id = snapc->snaps[which];
		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
		if (IS_ERR(snap_name)) {
			/* ignore no-longer existing snapshots */
			if (PTR_ERR(snap_name) == -ENOENT)
				continue;
			else
				break;
		}
		found = !strcmp(name, snap_name);
		kfree(snap_name);
	}
	return found ? snap_id : CEPH_NOSNAP;
}

/*
 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
 * no snapshot by that name is found, or if an error occurs.
 */
static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
{
	if (rbd_dev->image_format == 1)
		return rbd_v1_snap_id_by_name(rbd_dev, name);

	return rbd_v2_snap_id_by_name(rbd_dev, name);
}
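
/*
 * The format 1 lookup above is purely local, because v1 headers carry
 * every snapshot name in one '\0'-separated blob.  The format 2
 * lookup instead issues a "get_snapshot_name" request per snapshot
 * id, so it can race with snapshot deletion; -ENOENT for an
 * individual id is skipped rather than treated as an error.
 */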

/*
 * An image being mapped will have everything but the snap id.
 */
static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
{
	struct rbd_spec *spec = rbd_dev->spec;

	rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
	rbd_assert(spec->image_id && spec->image_name);
	rbd_assert(spec->snap_name);

	if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
		u64 snap_id;

		snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
		if (snap_id == CEPH_NOSNAP)
			return -ENOENT;

		spec->snap_id = snap_id;
	} else {
		spec->snap_id = CEPH_NOSNAP;
	}

	return 0;
}

/*
 * A parent image will have all ids but none of the names.
 *
 * All names in an rbd spec are dynamically allocated.  It's OK if we
 * can't figure out the name for an image id.
 */
static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_spec *spec = rbd_dev->spec;
	const char *pool_name;
	const char *image_name;
	const char *snap_name;
	int ret;

	rbd_assert(spec->pool_id != CEPH_NOPOOL);
	rbd_assert(spec->image_id);
	rbd_assert(spec->snap_id != CEPH_NOSNAP);

	/* Get the pool name; we have to make our own copy of this */

	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
	if (!pool_name) {
		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
		return -EIO;
	}
	pool_name = kstrdup(pool_name, GFP_KERNEL);
	if (!pool_name)
		return -ENOMEM;

	/* Fetch the image name; tolerate failure here */

	image_name = rbd_dev_image_name(rbd_dev);
	if (!image_name)
		rbd_warn(rbd_dev, "unable to get image name");

	/* Fetch the snapshot name */

	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out_err;
	}

	spec->pool_name = pool_name;
	spec->image_name = image_name;
	spec->snap_name = snap_name;

	return 0;

out_err:
	kfree(image_name);
	kfree(pool_name);
	return ret;
}

static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	void *reply_buf;
	void *p;
	void *end;
	u64 seq;
	u32 snap_count;
	struct ceph_snap_context *snapc;
	u32 i;

	/*
	 * We'll need room for the seq value (maximum snapshot id),
	 * snapshot count, and array of that many snapshot ids.
	 * For now we have a fixed upper limit on the number we're
	 * prepared to receive.
	 */
	size = sizeof (__le64) + sizeof (__le32) +
			RBD_MAX_SNAP_COUNT * sizeof (__le64);
	reply_buf = kzalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_snapcontext",
				  NULL, 0, reply_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	end = reply_buf + ret;
	ret = -ERANGE;
	ceph_decode_64_safe(&p, end, seq, out);
	ceph_decode_32_safe(&p, end, snap_count, out);

	/*
	 * Make sure the reported number of snapshot ids wouldn't go
	 * beyond the end of our buffer.  But before checking that,
	 * make sure the computed size of the snapshot context we
	 * allocate is representable in a size_t.
	 */
	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
				 / sizeof (u64)) {
		ret = -EINVAL;
		goto out;
	}
	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
		goto out;
	ret = 0;

	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc) {
		ret = -ENOMEM;
		goto out;
	}
	snapc->seq = seq;
	for (i = 0; i < snap_count; i++)
		snapc->snaps[i] = ceph_decode_64(&p);

	ceph_put_snap_context(rbd_dev->header.snapc);
	rbd_dev->header.snapc = snapc;

	dout(" snap context seq = %llu, snap_count = %u\n",
	     (unsigned long long)seq, (unsigned int)snap_count);
out:
	kfree(reply_buf);

	return ret;
}
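
/*
 * For reference, the "get_snapcontext" reply decoded above is:
 *
 *	__le64	seq;			(highest snapshot id so far)
 *	__le32	snap_count;
 *	__le64	snaps[snap_count];	(the snapshot ids themselves)
 *
 * and is bounds-checked before being copied into a freshly allocated
 * ceph_snap_context.
 */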

static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id)
{
	size_t size;
	void *reply_buf;
	__le64 snapid;
	int ret;
	void *p;
	void *end;
	char *snap_name;

	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	snapid = cpu_to_le64(snap_id);
	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_snapshot_name",
				  &snapid, sizeof(snapid), reply_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0) {
		snap_name = ERR_PTR(ret);
		goto out;
	}

	p = reply_buf;
	end = reply_buf + ret;
	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(snap_name))
		goto out;

	dout(" snap_id 0x%016llx snap_name = %s\n",
	     (unsigned long long)snap_id, snap_name);
out:
	kfree(reply_buf);

	return snap_name;
}

static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
{
	bool first_time = rbd_dev->header.object_prefix == NULL;
	int ret;

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret)
		return ret;

	if (first_time) {
		ret = rbd_dev_v2_header_onetime(rbd_dev);
		if (ret)
			return ret;
	}

	ret = rbd_dev_v2_snap_context(rbd_dev);
	if (ret && first_time) {
		kfree(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	}

	return ret;
}

static int rbd_dev_header_info(struct rbd_device *rbd_dev)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));

	if (rbd_dev->image_format == 1)
		return rbd_dev_v1_header_info(rbd_dev);

	return rbd_dev_v2_header_info(rbd_dev);
}
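
/*
 * In rbd_dev_v2_header_info() a NULL object prefix doubles as the
 * "never probed" marker: the immutable fields are fetched once via
 * rbd_dev_v2_header_onetime(), while the size and snapshot context
 * are re-read on every refresh, since those can change while mapped.
 */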

/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	const char *spaces = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* Find start of token */

	return strcspn(*buf, spaces);	/* Return token length */
}

/*
 * Finds the next token in *buf, dynamically allocates a buffer big
 * enough to hold a copy of it, and copies the token into the new
 * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
 * that a duplicate buffer is created even for a zero-length token.
 *
 * Returns a pointer to the newly-allocated duplicate, or a null
 * pointer if memory for the duplicate was not available.  If
 * the lenp argument is a non-null pointer, the length of the token
 * (not including the '\0') is returned in *lenp.
 *
 * If successful, the *buf pointer will be updated to point beyond
 * the end of the found token.
 *
 * Note: uses GFP_KERNEL for allocation.
 */
static inline char *dup_token(const char **buf, size_t *lenp)
{
	char *dup;
	size_t len;

	len = next_token(buf);
	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
	if (!dup)
		return NULL;
	*(dup + len) = '\0';
	*buf += len;

	if (lenp)
		*lenp = len;

	return dup;
}

/*
 * Parse the options provided for an "rbd add" (i.e., rbd image
 * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
 * and the data written is passed here via a NUL-terminated buffer.
 * Returns 0 if successful or an error code otherwise.
 *
 * The information extracted from these options is recorded in
 * the other parameters which return dynamically-allocated
 * structures:
 *  ceph_opts
 *      The address of a pointer that will refer to a ceph options
 *      structure.  Caller must release the returned pointer using
 *      ceph_destroy_options() when it is no longer needed.
 *  rbd_opts
 *      Address of an rbd options pointer.  Fully initialized by
 *      this function; caller must release with kfree().
 *  spec
 *      Address of an rbd image specification pointer.  Fully
 *      initialized by this function based on parsed options.
 *      Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_name>
 *      An optional snapshot name.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot name is
 *      provided.  Snapshot mappings are always read-only.
 */
static int rbd_add_parse_args(const char *buf,
			      struct ceph_options **ceph_opts,
			      struct rbd_options **opts,
			      struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	char *snap_name;
	size_t mon_addrs_size;
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	spec->image_name = dup_token(&buf, NULL);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_mem;
	*(snap_name + len) = '\0';
	spec->snap_name = snap_name;

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
	rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
	rbd_opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
	rbd_opts->exclusive = RBD_EXCLUSIVE_DEFAULT;

	copts = ceph_parse_options(options, mon_addrs,
				   mon_addrs + mon_addrs_size - 1,
				   parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}
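
/*
 * So, for example, a write like
 *
 *	echo "192.168.0.1:6789 name=admin,secret=AQBx... mypool myimage -" \
 *		> /sys/bus/rbd/add
 *
 * maps the head of image "myimage" in pool "mypool" (the address,
 * key, pool and image names here are all made up).
 */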

/*
 * Return pool id (>= 0) or a negative error code.
 */
static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
{
	struct ceph_options *opts = rbdc->client->options;
	u64 newest_epoch;
	int tries = 0;
	int ret;

again:
	ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
	if (ret == -ENOENT && tries++ < 1) {
		ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap",
					    &newest_epoch);
		if (ret < 0)
			return ret;

		if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
			ceph_osdc_maybe_request_map(&rbdc->client->osdc);
			(void) ceph_monc_wait_osdmap(&rbdc->client->monc,
						     newest_epoch,
						     opts->mount_timeout);
			goto again;
		} else {
			/* the osdmap we have is new enough */
			return -ENOENT;
		}
	}

	return ret;
}
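
/*
 * The single retry above covers the case where the pool was created
 * so recently that the cached osdmap predates it: if the monitors
 * report a newer epoch, the lookup is repeated once against the
 * updated map before -ENOENT is considered authoritative.
 */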

static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
{
	down_write(&rbd_dev->lock_rwsem);
	if (__rbd_is_lock_owner(rbd_dev))
		rbd_unlock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);
}

static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
{
	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
		rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
		return -EINVAL;
	}

	/* FIXME: "rbd map --exclusive" should be interruptible */
	down_read(&rbd_dev->lock_rwsem);
	rbd_wait_state_locked(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
	if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
		rbd_warn(rbd_dev, "failed to acquire exclusive lock");
		return -EROFS;
	}

	return 0;
}

/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	CEPH_DEFINE_OID_ONSTACK(oid);
	void *response;
	char *image_id;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.  We
	 * do still need to set the image format though.
	 */
	if (rbd_dev->spec->image_id) {
		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;

		return 0;
	}

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
			       rbd_dev->spec->image_name);
	if (ret)
		return ret;

	dout("rbd id object name is %s\n", oid.name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* If it doesn't exist we'll assume it's a format 1 image */

	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
				  "get_id", NULL, 0,
				  response, RBD_IMAGE_ID_LEN_MAX);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret == -ENOENT) {
		image_id = kstrdup("", GFP_KERNEL);
		ret = image_id ? 0 : -ENOMEM;
		if (!ret)
			rbd_dev->image_format = 1;
	} else if (ret >= 0) {
		void *p = response;

		image_id = ceph_extract_encoded_string(&p, p + ret,
						NULL, GFP_NOIO);
		ret = PTR_ERR_OR_ZERO(image_id);
		if (!ret)
			rbd_dev->image_format = 2;
	}

	if (!ret) {
		rbd_dev->spec->image_id = image_id;
		dout("image_id is %s\n", image_id);
	}
out:
	kfree(response);
	ceph_oid_destroy(&oid);
	return ret;
}
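
/*
 * Note that a format 1 image is recorded with an empty-string
 * image_id rather than a NULL one; that is what lets the early
 * return above distinguish "format already known" from "never
 * probed".  The id object itself is named RBD_ID_PREFIX followed by
 * the image name (e.g. "rbd_id.foo" for an image named "foo").
 */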

/*
 * Undo whatever state changes are made by a v1 or v2 header info
 * call.
 */
static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
{
	struct rbd_image_header *header;

	rbd_dev_parent_put(rbd_dev);

	/* Free dynamic fields from the header, then zero it out */

	header = &rbd_dev->header;
	ceph_put_snap_context(header->snapc);
	kfree(header->snap_sizes);
	kfree(header->snap_names);
	kfree(header->object_prefix);
	memset(header, 0, sizeof (*header));
}

static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
{
	int ret;

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret)
		goto out_err;

	/*
	 * Get and check the features for the image.  Currently the
	 * features are assumed to never change.
	 */
	ret = rbd_dev_v2_features(rbd_dev);
	if (ret)
		goto out_err;

	/* If the image supports fancy striping, get its parameters */

	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
		ret = rbd_dev_v2_striping_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
		ret = rbd_dev_v2_data_pool(rbd_dev);
		if (ret)
			goto out_err;
	}

	rbd_init_layout(rbd_dev);
	return 0;

out_err:
	rbd_dev->header.features = 0;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;
	return ret;
}

/*
 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
 * rbd_dev_image_probe() recursion depth, which means it's also the
 * length of the already discovered part of the parent chain.
 */
static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
{
	struct rbd_device *parent = NULL;
	int ret;

	if (!rbd_dev->parent_spec)
		return 0;

	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
		pr_info("parent chain is too long (%d)\n", depth);
		ret = -EINVAL;
		goto out_err;
	}

	parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
	if (!parent) {
		ret = -ENOMEM;
		goto out_err;
	}

	/*
	 * Images related by parent/child relationships always share
	 * rbd_client and spec/parent_spec, so bump their refcounts.
	 */
	__rbd_get_client(rbd_dev->rbd_client);
	rbd_spec_get(rbd_dev->parent_spec);

	ret = rbd_dev_image_probe(parent, depth);
	if (ret < 0)
		goto out_err;

	rbd_dev->parent = parent;
	atomic_set(&rbd_dev->parent_ref, 1);
	return 0;

out_err:
	rbd_dev_unparent(rbd_dev);
	rbd_dev_destroy(parent);
	return ret;
}
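
/*
 * rbd_dev_probe_parent() and rbd_dev_image_probe() are mutually
 * recursive, walking one step up the clone chain per call; the
 * RBD_MAX_PARENT_CHAIN_LEN check above is what bounds both the
 * recursion and the kernel stack usage it implies.
 */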
5299
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005300static void rbd_dev_device_release(struct rbd_device *rbd_dev)
5301{
5302 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5303 rbd_dev_mapping_clear(rbd_dev);
5304 rbd_free_disk(rbd_dev);
5305 if (!single_major)
5306 unregister_blkdev(rbd_dev->major, rbd_dev->name);
5307}
5308
Ilya Dryomov811c6682016-04-15 16:22:16 +02005309/*
5310 * rbd_dev->header_rwsem must be locked for write and will be unlocked
5311 * upon return.
5312 */
Alex Elder200a6a82013-04-28 23:32:34 -05005313static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
Alex Elder124afba2013-04-26 15:44:36 -05005314{
Alex Elder83a06262012-10-30 15:47:17 -05005315 int ret;
Alex Elder83a06262012-10-30 15:47:17 -05005316
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005317 /* Record our major and minor device numbers. */
Alex Elder83a06262012-10-30 15:47:17 -05005318
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005319 if (!single_major) {
5320 ret = register_blkdev(0, rbd_dev->name);
5321 if (ret < 0)
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005322 goto err_out_unlock;
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005323
5324 rbd_dev->major = ret;
5325 rbd_dev->minor = 0;
5326 } else {
5327 rbd_dev->major = rbd_major;
5328 rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
5329 }
Alex Elder83a06262012-10-30 15:47:17 -05005330
5331 /* Set up the blkdev mapping. */
5332
5333 ret = rbd_init_disk(rbd_dev);
5334 if (ret)
5335 goto err_out_blkdev;
5336
Alex Elderf35a4de2013-05-06 09:51:29 -05005337 ret = rbd_dev_mapping_set(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05005338 if (ret)
5339 goto err_out_disk;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04005340
Alex Elderf35a4de2013-05-06 09:51:29 -05005341 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
Ilya Dryomov9568c932017-10-12 12:35:19 +02005342 set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);
Alex Elderf35a4de2013-05-06 09:51:29 -05005343
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005344 ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
Alex Elderf35a4de2013-05-06 09:51:29 -05005345 if (ret)
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04005346 goto err_out_mapping;
Alex Elder83a06262012-10-30 15:47:17 -05005347
Alex Elder129b79d2013-04-26 15:44:36 -05005348 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
Ilya Dryomov811c6682016-04-15 16:22:16 +02005349 up_write(&rbd_dev->header_rwsem);
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005350 return 0;
Alex Elder2f82ee52012-10-30 19:40:33 -05005351
Alex Elderf35a4de2013-05-06 09:51:29 -05005352err_out_mapping:
5353 rbd_dev_mapping_clear(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05005354err_out_disk:
5355 rbd_free_disk(rbd_dev);
5356err_out_blkdev:
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005357 if (!single_major)
5358 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Ilya Dryomov811c6682016-04-15 16:22:16 +02005359err_out_unlock:
5360 up_write(&rbd_dev->header_rwsem);
Alex Elder83a06262012-10-30 15:47:17 -05005361 return ret;
5362}
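
/*
 * A worked example of the numbering above (illustrative): without
 * single_major, every mapping gets its own dynamically allocated major
 * from register_blkdev() and always uses minor 0, so two mapped images
 * occupy two majors.  With single_major=Y all mappings share rbd_major
 * and rbd_dev_id_to_minor() packs the dev_id into the minor number,
 * leaving a small range of minors per device for partitions (dev_id 1
 * still appears as /dev/rbd1).
 */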

static int rbd_dev_header_name(struct rbd_device *rbd_dev)
{
	struct rbd_spec *spec = rbd_dev->spec;
	int ret;

	/* Record the header object name for this rbd image. */

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (rbd_dev->image_format == 1)
		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
				       spec->image_name, RBD_SUFFIX);
	else
		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
				       RBD_HEADER_PREFIX, spec->image_id);

	return ret;
}
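
/*
 * For example, assuming the usual rbd_types.h definitions (RBD_SUFFIX
 * is ".rbd", RBD_HEADER_PREFIX is "rbd_header."):
 *   format 1, image name "foo"       -> header object "foo.rbd"
 *   format 2, image id "1028ae8944a" -> header object "rbd_header.1028ae8944a"
 */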

static void rbd_dev_image_release(struct rbd_device *rbd_dev)
{
	rbd_dev_unprobe(rbd_dev);
	if (rbd_dev->opts)
		rbd_unregister_watch(rbd_dev);
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;
}

/*
 * Probe for the existence of the header object for the given rbd
 * device.  If this image is the one being mapped (i.e., not a
 * parent), initiate a watch on its header object before using that
 * object to get detailed information about the rbd image.
 */
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
{
	int ret;

	/*
	 * Get the id from the image id object.  Unless there's an
	 * error, rbd_dev->spec->image_id will be filled in with
	 * a dynamically-allocated string, and rbd_dev->image_format
	 * will be set to either 1 or 2.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_header_name(rbd_dev);
	if (ret)
		goto err_out_format;

	if (!depth) {
		ret = rbd_register_watch(rbd_dev);
		if (ret) {
			if (ret == -ENOENT)
				pr_info("image %s/%s does not exist\n",
					rbd_dev->spec->pool_name,
					rbd_dev->spec->image_name);
			goto err_out_format;
		}
	}

	ret = rbd_dev_header_info(rbd_dev);
	if (ret)
		goto err_out_watch;

	/*
	 * If this image is the one being mapped, we have pool name and
	 * id, image name and id, and snap name - need to fill snap id.
	 * Otherwise this is a parent image, identified by pool, image
	 * and snap ids - need to fill in names for those ids.
	 */
	if (!depth)
		ret = rbd_spec_fill_snap_id(rbd_dev);
	else
		ret = rbd_spec_fill_names(rbd_dev);
	if (ret) {
		if (ret == -ENOENT)
			pr_info("snap %s/%s@%s does not exist\n",
				rbd_dev->spec->pool_name,
				rbd_dev->spec->image_name,
				rbd_dev->spec->snap_name);
		goto err_out_probe;
	}

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret)
			goto err_out_probe;

		/*
		 * Need to warn users if this image is the one being
		 * mapped and has a parent.
		 */
		if (!depth && rbd_dev->parent_spec)
			rbd_warn(rbd_dev,
				 "WARNING: kernel layering is EXPERIMENTAL!");
	}

	ret = rbd_dev_probe_parent(rbd_dev, depth);
	if (ret)
		goto err_out_probe;

	dout("discovered format %u image, header name is %s\n",
	     rbd_dev->image_format, rbd_dev->header_oid.name);
	return 0;

err_out_probe:
	rbd_dev_unprobe(rbd_dev);
err_out_watch:
	if (!depth)
		rbd_unregister_watch(rbd_dev);
err_out_format:
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;
	return ret;
}

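/*
 * Handle a write to /sys/bus/rbd/add (or add_single_major).  Per
 * Documentation/ABI/testing/sysfs-bus-rbd the buffer looks roughly
 * like
 *
 *   $ echo "1.2.3.4:6789 name=admin,secret=<key> rbdpool foo snap1" > \
 *       /sys/bus/rbd/add
 *
 * i.e. monitor address(es), options, pool name, image name and an
 * optional snapshot name; the exact grammar is whatever
 * rbd_add_parse_args() accepts.
 */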
static ssize_t do_rbd_add(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	int rc;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto out;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}

	/* pick the pool */
	rc = rbd_add_get_pool_id(rbdc, spec->pool_name);
	if (rc < 0) {
		if (rc == -ENOENT)
			pr_info("pool %s does not exist\n", spec->pool_name);
		goto err_out_client;
	}
	spec->pool_id = (u64)rc;

	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
	if (!rbd_dev) {
		rc = -ENOMEM;
		goto err_out_client;
	}
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */
	rbd_opts = NULL;	/* rbd_dev now owns this */

	rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
	if (!rbd_dev->config_info) {
		rc = -ENOMEM;
		goto err_out_rbd_dev;
	}

	down_write(&rbd_dev->header_rwsem);
	rc = rbd_dev_image_probe(rbd_dev, 0);
	if (rc < 0) {
		up_write(&rbd_dev->header_rwsem);
		goto err_out_rbd_dev;
	}

	/* If we are mapping a snapshot it must be marked read-only */
	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
		rbd_dev->opts->read_only = true;

	rc = rbd_dev_device_setup(rbd_dev);
	if (rc)
		goto err_out_image_probe;

	if (rbd_dev->opts->exclusive) {
		rc = rbd_add_acquire_lock(rbd_dev);
		if (rc)
			goto err_out_device_setup;
	}

	/* Everything's ready.  Announce the disk to the world. */

	rc = device_add(&rbd_dev->dev);
	if (rc)
		goto err_out_image_lock;

	add_disk(rbd_dev->disk);
	/* see rbd_init_disk() */
	blk_put_queue(rbd_dev->disk->queue);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);

	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
		rbd_dev->header.features);
	rc = count;
out:
	module_put(THIS_MODULE);
	return rc;

err_out_image_lock:
	rbd_dev_image_unlock(rbd_dev);
err_out_device_setup:
	rbd_dev_device_release(rbd_dev);
err_out_image_probe:
	rbd_dev_image_release(rbd_dev);
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	rbd_spec_put(spec);
	kfree(rbd_opts);
	goto out;
}

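/*
 * Two sets of sysfs entry points exist: rbd_add()/rbd_remove() back the
 * default one-major-per-device scheme, while the *_single_major()
 * variants are used when the driver is loaded with single_major=Y.
 * Whichever set is not in use is wired to return -EINVAL, so only one
 * interface is live at a time.
 */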
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	if (single_major)
		return -EINVAL;

	return do_rbd_add(bus, buf, count);
}

static ssize_t rbd_add_single_major(struct bus_type *bus,
				    const char *buf,
				    size_t count)
{
	return do_rbd_add(bus, buf, count);
}

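/*
 * Tear down the parent chain bottom-up: walk to the deepest ancestor
 * (the parent that has no parent of its own), release it, and repeat
 * until rbd_dev itself has no parent left.
 */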
static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
{
	while (rbd_dev->parent) {
		struct rbd_device *first = rbd_dev;
		struct rbd_device *second = first->parent;
		struct rbd_device *third;

		/*
		 * Follow to the parent with no grandparent and
		 * remove it.
		 */
		while (second && (third = second->parent)) {
			first = second;
			second = third;
		}
		rbd_assert(second);
		rbd_dev_image_release(second);
		rbd_dev_destroy(second);
		first->parent = NULL;
		first->parent_overlap = 0;

		rbd_assert(first->parent_spec);
		rbd_spec_put(first->parent_spec);
		first->parent_spec = NULL;
	}
}

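/*
 * Handle a write to /sys/bus/rbd/remove (or remove_single_major).  The
 * buffer is the device id, optionally followed by "force", e.g.:
 *
 *   $ echo 2 > /sys/bus/rbd/remove
 *   $ echo "2 force" > /sys/bus/rbd/remove   (unmap even while open)
 */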
static ssize_t do_rbd_remove(struct bus_type *bus,
			     const char *buf,
			     size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct list_head *tmp;
	int dev_id;
	char opt_buf[6];
	bool already = false;
	bool force = false;
	int ret;

	dev_id = -1;
	opt_buf[0] = '\0';
	sscanf(buf, "%d %5s", &dev_id, opt_buf);
	if (dev_id < 0) {
		pr_err("dev_id out of range\n");
		return -EINVAL;
	}
	if (opt_buf[0] != '\0') {
		if (!strcmp(opt_buf, "force")) {
			force = true;
		} else {
			pr_err("bad remove option at '%s'\n", opt_buf);
			return -EINVAL;
		}
	}

	ret = -ENOENT;
	spin_lock(&rbd_dev_list_lock);
	list_for_each(tmp, &rbd_dev_list) {
		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id == dev_id) {
			ret = 0;
			break;
		}
	}
	if (!ret) {
		spin_lock_irq(&rbd_dev->lock);
		if (rbd_dev->open_count && !force)
			ret = -EBUSY;
		else
			already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
						   &rbd_dev->flags);
		spin_unlock_irq(&rbd_dev->lock);
	}
	spin_unlock(&rbd_dev_list_lock);
	if (ret < 0 || already)
		return ret;

	if (force) {
		/*
		 * Prevent new IO from being queued and wait for existing
		 * IO to complete/fail.
		 */
		blk_mq_freeze_queue(rbd_dev->disk->queue);
		blk_set_queue_dying(rbd_dev->disk->queue);
	}

	del_gendisk(rbd_dev->disk);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);
	spin_unlock(&rbd_dev_list_lock);
	device_del(&rbd_dev->dev);

	rbd_dev_image_unlock(rbd_dev);
	rbd_dev_device_release(rbd_dev);
	rbd_dev_image_release(rbd_dev);
	rbd_dev_destroy(rbd_dev);
	return count;
}

static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	if (single_major)
		return -EINVAL;

	return do_rbd_remove(bus, buf, count);
}

static ssize_t rbd_remove_single_major(struct bus_type *bus,
				       const char *buf,
				       size_t count)
{
	return do_rbd_remove(bus, buf, count);
}

/*
 * create control files in sysfs
 * /sys/bus/rbd/...
 */
static int rbd_sysfs_init(void)
{
	int ret;

	ret = device_register(&rbd_root_dev);
	if (ret < 0)
		return ret;

	ret = bus_register(&rbd_bus_type);
	if (ret < 0)
		device_unregister(&rbd_root_dev);

	return ret;
}

static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}

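/*
 * Slab caches for the image and object request structures.  KMEM_CACHE()
 * takes the cache name and object size from the struct definition, so
 * these show up as rbd_img_request and rbd_obj_request in
 * /proc/slabinfo.
 */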
static int rbd_slab_init(void)
{
	rbd_assert(!rbd_img_request_cache);
	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
	if (!rbd_img_request_cache)
		return -ENOMEM;

	rbd_assert(!rbd_obj_request_cache);
	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
	if (!rbd_obj_request_cache)
		goto out_err;

	return 0;

out_err:
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
	return -ENOMEM;
}

static void rbd_slab_exit(void)
{
	rbd_assert(rbd_obj_request_cache);
	kmem_cache_destroy(rbd_obj_request_cache);
	rbd_obj_request_cache = NULL;

	rbd_assert(rbd_img_request_cache);
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
}

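/*
 * Module init/exit.  Setup order is slab caches, the shared workqueue,
 * the block major (single_major only) and finally the sysfs interface;
 * rbd_exit() below unwinds in the reverse order.
 */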
static int __init rbd_init(void)
{
	int rc;

	if (!libceph_compatible(NULL)) {
		rbd_warn(NULL, "libceph incompatibility (quitting)");
		return -EINVAL;
	}

	rc = rbd_slab_init();
	if (rc)
		return rc;

	/*
	 * The number of active work items is limited by the number of
	 * rbd devices * queue depth, so leave @max_active at default.
	 */
	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
	if (!rbd_wq) {
		rc = -ENOMEM;
		goto err_out_slab;
	}

	if (single_major) {
		rbd_major = register_blkdev(0, RBD_DRV_NAME);
		if (rbd_major < 0) {
			rc = rbd_major;
			goto err_out_wq;
		}
	}

	rc = rbd_sysfs_init();
	if (rc)
		goto err_out_blkdev;

	if (single_major)
		pr_info("loaded (major %d)\n", rbd_major);
	else
		pr_info("loaded\n");

	return 0;

err_out_blkdev:
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
err_out_wq:
	destroy_workqueue(rbd_wq);
err_out_slab:
	rbd_slab_exit();
	return rc;
}

static void __exit rbd_exit(void)
{
	ida_destroy(&rbd_dev_id_ida);
	rbd_sysfs_cleanup();
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
	destroy_workqueue(rbd_wq);
	rbd_slab_exit();
}

module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
MODULE_LICENSE("GPL");