/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

                 Documentation/ABI/testing/sysfs-bus-rbd

 */
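/*
 * Illustrative example only -- the ABI document above is authoritative.
 * An image is mapped and unmapped through sysfs roughly like this:
 *
 *   # echo "1.2.3.4:6789 name=admin,secret=<key> rbd myimage" \
 *   #      > /sys/bus/rbd/add
 *   # echo 0 > /sys/bus/rbd/remove
 *
 * i.e. "<mon addrs> <options> <pool> <image> [<snap>]" to add, and the
 * device id (the # in "rbd#") to remove.
 */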

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/striper.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/workqueue.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * Increment the given counter and return its updated value.
 * If the counter is already 0 it will not be incremented.
 * If the counter is already at its maximum value returns
 * -EINVAL without updating it.
 */
static int atomic_inc_return_safe(atomic_t *v)
{
        unsigned int counter;

        counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
        if (counter <= (unsigned int)INT_MAX)
                return (int)counter;

        atomic_dec(v);

        return -EINVAL;
}

/* Decrement the counter.  Return the resulting value, or -EINVAL */
static int atomic_dec_return_safe(atomic_t *v)
{
        int counter;

        counter = atomic_dec_return(v);
        if (counter >= 0)
                return counter;

        atomic_inc(v);

        return -EINVAL;
}
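
/*
 * Sketch (illustrative) of how these are used for parent_ref further
 * down: a positive return pins the parent image, 0 means the parent
 * has already gone away, and -EINVAL means the counter saturated:
 *
 *	if (atomic_inc_return_safe(&rbd_dev->parent_ref) > 0)
 *		... parent is pinned, safe to use rbd_dev->parent ...
 */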

#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING		(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
#define RBD_FEATURE_DATA_POOL		(1ULL<<7)
#define RBD_FEATURE_OPERATIONS		(1ULL<<8)

#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
        /* These six fields never change for a given rbd image */
        char *object_prefix;
        __u8 obj_order;
        u64 stripe_unit;
        u64 stripe_count;
        s64 data_pool_id;
        u64 features;		/* Might be changeable someday? */

        /* The remaining fields need to be updated occasionally */
        u64 image_size;
        struct ceph_snap_context *snapc;
        char *snap_names;	/* format 1 only */
        u64 *snap_sizes;	/* format 1 only */
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
        u64		pool_id;
        const char	*pool_name;
        const char	*pool_ns;	/* NULL if default, never "" */

        const char	*image_id;
        const char	*image_name;

        u64		snap_id;
        const char	*snap_name;

        struct kref	kref;
};
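
/*
 * For example (illustrative): mapping "rbd/myimage@snap1" yields a spec
 * with pool_name "rbd", image_name "myimage" and snap_name "snap1";
 * pool_id, image_id and snap_id are then filled in by lookups against
 * the cluster during the discovery phase mentioned above.
 */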

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
        struct ceph_client	*client;
        struct kref		kref;
        struct list_head	node;
};

struct rbd_img_request;

enum obj_request_type {
        OBJ_REQUEST_NODATA = 1,
        OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
        OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
        OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
};

enum obj_operation_type {
        OBJ_OP_READ = 1,
        OBJ_OP_WRITE,
        OBJ_OP_DISCARD,
};

/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 *                       need copyup
 * RBD_OBJ_WRITE_GUARD ---------------> RBD_OBJ_WRITE_COPYUP
 *            |     ^                              |
 *            v     \------------------------------/
 *          done
 *            ^
 *            |
 * RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * there is a parent or not.
 */
enum rbd_obj_write_state {
        RBD_OBJ_WRITE_FLAT = 1,
        RBD_OBJ_WRITE_GUARD,
        RBD_OBJ_WRITE_COPYUP,
};
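
/*
 * Sketch (illustrative) of the "need copyup" transition handled by the
 * write completion code: a guarded write whose existence check fails
 * falls back to copying the range up from the parent:
 *
 *	case RBD_OBJ_WRITE_GUARD:
 *		if (result == -ENOENT) {
 *			(read the parent extents, resubmit the write
 *			 with copyup, move to RBD_OBJ_WRITE_COPYUP)
 *		}
 */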

struct rbd_obj_request {
        struct ceph_object_extent ex;
        union {
                bool			tried_parent;	/* for reads */
                enum rbd_obj_write_state write_state;	/* for writes */
        };

        struct rbd_img_request	*img_request;
        struct ceph_file_extent	*img_extents;
        u32			num_img_extents;

        union {
                struct ceph_bio_iter	bio_pos;
                struct {
                        struct ceph_bvec_iter	bvec_pos;
                        u32			bvec_count;
                        u32			bvec_idx;
                };
        };
        struct bio_vec		*copyup_bvecs;
        u32			copyup_bvec_count;

        struct ceph_osd_request	*osd_req;

        u64			xferred;	/* bytes transferred */
        int			result;

        struct kref		kref;
};

enum img_req_flags {
        IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
        IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};

struct rbd_img_request {
        struct rbd_device	*rbd_dev;
        enum obj_operation_type	op_type;
        enum obj_request_type	data_type;
        unsigned long		flags;
        union {
                u64			snap_id;	/* for reads */
                struct ceph_snap_context *snapc;	/* for writes */
        };
        union {
                struct request		*rq;		/* block request */
                struct rbd_obj_request	*obj_request;	/* obj req initiator */
        };
        spinlock_t		completion_lock;
        u64			xferred;/* aggregate bytes transferred */
        int			result;	/* first nonzero obj_request result */

        struct list_head	object_extents;	/* obj_req.ex structs */
        u32			obj_request_count;
        u32			pending_count;

        struct kref		kref;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)
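
/*
 * Typical use (sketch): walking an image request's object requests,
 * e.g. during teardown, where the _safe variant allows removing the
 * current entry while iterating:
 *
 *	struct rbd_obj_request *obj_req, *next_obj_req;
 *
 *	for_each_obj_request_safe(img_req, obj_req, next_obj_req)
 *		(unlink obj_req from img_req and drop its reference)
 */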

enum rbd_watch_state {
        RBD_WATCH_STATE_UNREGISTERED,
        RBD_WATCH_STATE_REGISTERED,
        RBD_WATCH_STATE_ERROR,
};

enum rbd_lock_state {
        RBD_LOCK_STATE_UNLOCKED,
        RBD_LOCK_STATE_LOCKED,
        RBD_LOCK_STATE_RELEASING,
};

/* WatchNotify::ClientId */
struct rbd_client_id {
        u64 gid;
        u64 handle;
};

struct rbd_mapping {
        u64			size;
        u64			features;
};

/*
 * a single device
 */
struct rbd_device {
        int			dev_id;		/* blkdev unique id */

        int			major;		/* blkdev assigned major */
        int			minor;
        struct gendisk		*disk;		/* blkdev's gendisk and rq */

        u32			image_format;	/* Either 1 or 2 */
        struct rbd_client	*rbd_client;

        char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

        spinlock_t		lock;		/* queue, flags, open_count */

        struct rbd_image_header	header;
        unsigned long		flags;		/* possibly lock protected */
        struct rbd_spec		*spec;
        struct rbd_options	*opts;
        char			*config_info;	/* add{,_single_major} string */

        struct ceph_object_id	header_oid;
        struct ceph_object_locator header_oloc;

        struct ceph_file_layout	layout;		/* used for all rbd requests */

        struct mutex		watch_mutex;
        enum rbd_watch_state	watch_state;
        struct ceph_osd_linger_request *watch_handle;
        u64			watch_cookie;
        struct delayed_work	watch_dwork;

        struct rw_semaphore	lock_rwsem;
        enum rbd_lock_state	lock_state;
        char			lock_cookie[32];
        struct rbd_client_id	owner_cid;
        struct work_struct	acquired_lock_work;
        struct work_struct	released_lock_work;
        struct delayed_work	lock_dwork;
        struct work_struct	unlock_work;
        wait_queue_head_t	lock_waitq;

        struct workqueue_struct	*task_wq;

        struct rbd_spec		*parent_spec;
        u64			parent_overlap;
        atomic_t		parent_ref;
        struct rbd_device	*parent;

        /* Block layer tags. */
        struct blk_mq_tag_set	tag_set;

        /* protects updating the header */
        struct rw_semaphore	header_rwsem;

        struct rbd_mapping	mapping;

        struct list_head	node;

        /* sysfs related */
        struct device		dev;
        unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags:
 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 *   by rbd_dev->lock
 * - BLACKLISTED is protected by rbd_dev->lock_rwsem
 */
enum rbd_dev_flags {
        RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
        RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
        RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
};

static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

static struct workqueue_struct *rbd_wq;

/*
 * single-major requires >= 0.75 version of userspace rbd utility.
 */
static bool single_major = true;
module_param(single_major, bool, 0444);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
                       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
                          size_t count);
static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
                                    size_t count);
static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
                                       size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);

static int rbd_dev_id_to_minor(int dev_id)
{
        return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
}

static int minor_to_rbd_dev_id(int minor)
{
        return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}
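
/*
 * Worked example: with RBD_SINGLE_MAJOR_PART_SHIFT == 4, dev_id 1 maps
 * to minor 16, and minors 16..31 (the whole-device node plus up to 15
 * partitions) all map back to dev_id 1.
 */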

static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
        return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
               rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
}

static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
        bool is_lock_owner;

        down_read(&rbd_dev->lock_rwsem);
        is_lock_owner = __rbd_is_lock_owner(rbd_dev);
        up_read(&rbd_dev->lock_rwsem);
        return is_lock_owner;
}

static ssize_t rbd_supported_features_show(struct bus_type *bus, char *buf)
{
        return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}

static BUS_ATTR(add, 0200, NULL, rbd_add);
static BUS_ATTR(remove, 0200, NULL, rbd_remove);
static BUS_ATTR(add_single_major, 0200, NULL, rbd_add_single_major);
static BUS_ATTR(remove_single_major, 0200, NULL, rbd_remove_single_major);
static BUS_ATTR(supported_features, 0444, rbd_supported_features_show, NULL);

static struct attribute *rbd_bus_attrs[] = {
        &bus_attr_add.attr,
        &bus_attr_remove.attr,
        &bus_attr_add_single_major.attr,
        &bus_attr_remove_single_major.attr,
        &bus_attr_supported_features.attr,
        NULL,
};

static umode_t rbd_bus_is_visible(struct kobject *kobj,
                                  struct attribute *attr, int index)
{
        if (!single_major &&
            (attr == &bus_attr_add_single_major.attr ||
             attr == &bus_attr_remove_single_major.attr))
                return 0;

        return attr->mode;
}

static const struct attribute_group rbd_bus_group = {
        .attrs = rbd_bus_attrs,
        .is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);

static struct bus_type rbd_bus_type = {
        .name		= "rbd",
        .bus_groups	= rbd_bus_groups,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
        .init_name =    "rbd",
        .release =      rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        va_start(args, fmt);
        vaf.fmt = fmt;
        vaf.va = &args;

        if (!rbd_dev)
                printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
        else if (rbd_dev->disk)
                printk(KERN_WARNING "%s: %s: %pV\n",
                        RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
        else if (rbd_dev->spec && rbd_dev->spec->image_name)
                printk(KERN_WARNING "%s: image %s: %pV\n",
                        RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
        else if (rbd_dev->spec && rbd_dev->spec->image_id)
                printk(KERN_WARNING "%s: id %s: %pV\n",
                        RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
        else	/* punt */
                printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
                        RBD_DRV_NAME, rbd_dev, &vaf);
        va_end(args);
}
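
/*
 * Typical use (sketch): rbd_warn(rbd_dev, "failed to watch: %d", ret);
 * surfaces as "rbd: rbd0: failed to watch: <ret>" once the disk
 * exists, falling back to the image name/id or the bare "rbd:" prefix
 * earlier in setup.
 */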

#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_header_info(struct rbd_device *rbd_dev);
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
                                        u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
                u8 *order, u64 *snap_size);
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
                u64 *snap_features);

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
        struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
        bool removing = false;

        spin_lock_irq(&rbd_dev->lock);
        if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
                removing = true;
        else
                rbd_dev->open_count++;
        spin_unlock_irq(&rbd_dev->lock);
        if (removing)
                return -ENOENT;

        (void) get_device(&rbd_dev->dev);

        return 0;
}

static void rbd_release(struct gendisk *disk, fmode_t mode)
{
        struct rbd_device *rbd_dev = disk->private_data;
        unsigned long open_count_before;

        spin_lock_irq(&rbd_dev->lock);
        open_count_before = rbd_dev->open_count--;
        spin_unlock_irq(&rbd_dev->lock);
        rbd_assert(open_count_before > 0);

        put_device(&rbd_dev->dev);
}

static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
{
        int ro;

        if (get_user(ro, (int __user *)arg))
                return -EFAULT;

        /* Snapshots can't be marked read-write */
        if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
                return -EROFS;

        /* Let blkdev_roset() handle it */
        return -ENOTTY;
}

static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
                        unsigned int cmd, unsigned long arg)
{
        struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
        int ret;

        switch (cmd) {
        case BLKROSET:
                ret = rbd_ioctl_set_ro(rbd_dev, arg);
                break;
        default:
                ret = -ENOTTY;
        }

        return ret;
}

#ifdef CONFIG_COMPAT
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
                                unsigned int cmd, unsigned long arg)
{
        return rbd_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */

static const struct block_device_operations rbd_bd_ops = {
        .owner			= THIS_MODULE,
        .open			= rbd_open,
        .release		= rbd_release,
        .ioctl			= rbd_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl		= rbd_compat_ioctl,
#endif
};

/*
 * Initialize an rbd client instance.  Success or not, this function
 * consumes ceph_opts.  Caller holds client_mutex.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
        struct rbd_client *rbdc;
        int ret = -ENOMEM;

        dout("%s:\n", __func__);
        rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
        if (!rbdc)
                goto out_opt;

        kref_init(&rbdc->kref);
        INIT_LIST_HEAD(&rbdc->node);

        rbdc->client = ceph_create_client(ceph_opts, rbdc);
        if (IS_ERR(rbdc->client))
                goto out_rbdc;
        ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

        ret = ceph_open_session(rbdc->client);
        if (ret < 0)
                goto out_client;

        spin_lock(&rbd_client_list_lock);
        list_add_tail(&rbdc->node, &rbd_client_list);
        spin_unlock(&rbd_client_list_lock);

        dout("%s: rbdc %p\n", __func__, rbdc);

        return rbdc;
out_client:
        ceph_destroy_client(rbdc->client);
out_rbdc:
        kfree(rbdc);
out_opt:
        if (ceph_opts)
                ceph_destroy_options(ceph_opts);
        dout("%s: error %d\n", __func__, ret);

        return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
        kref_get(&rbdc->kref);

        return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
        struct rbd_client *client_node;
        bool found = false;

        if (ceph_opts->flags & CEPH_OPT_NOSHARE)
                return NULL;

        spin_lock(&rbd_client_list_lock);
        list_for_each_entry(client_node, &rbd_client_list, node) {
                if (!ceph_compare_options(ceph_opts, client_node->client)) {
                        __rbd_get_client(client_node);

                        found = true;
                        break;
                }
        }
        spin_unlock(&rbd_client_list_lock);

        return found ? client_node : NULL;
}

/*
 * (Per device) rbd map options
 */
enum {
        Opt_queue_depth,
        Opt_lock_timeout,
        Opt_last_int,
        /* int args above */
        Opt_pool_ns,
        Opt_last_string,
        /* string args above */
        Opt_read_only,
        Opt_read_write,
        Opt_lock_on_read,
        Opt_exclusive,
        Opt_notrim,
        Opt_err
};

static match_table_t rbd_opts_tokens = {
        {Opt_queue_depth, "queue_depth=%d"},
        {Opt_lock_timeout, "lock_timeout=%d"},
        /* int args above */
        {Opt_pool_ns, "_pool_ns=%s"},
        /* string args above */
        {Opt_read_only, "read_only"},
        {Opt_read_only, "ro"},		/* Alternate spelling */
        {Opt_read_write, "read_write"},
        {Opt_read_write, "rw"},		/* Alternate spelling */
        {Opt_lock_on_read, "lock_on_read"},
        {Opt_exclusive, "exclusive"},
        {Opt_notrim, "notrim"},
        {Opt_err, NULL}
};
765
Alex Elder98571b52013-01-20 14:44:42 -0600766struct rbd_options {
Ilya Dryomovb5584182015-06-23 16:21:19 +0300767 int queue_depth;
Dongsheng Yang34f55d02018-03-26 10:22:55 -0400768 unsigned long lock_timeout;
Alex Elder98571b52013-01-20 14:44:42 -0600769 bool read_only;
Ilya Dryomov80de1912016-09-20 14:23:17 +0200770 bool lock_on_read;
Ilya Dryomove010dd02017-04-13 12:17:39 +0200771 bool exclusive;
Ilya Dryomovd9360542018-03-23 06:14:47 +0100772 bool trim;
Alex Elder98571b52013-01-20 14:44:42 -0600773};
774
Ilya Dryomovb5584182015-06-23 16:21:19 +0300775#define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_MAX_RQ
Dongsheng Yang34f55d02018-03-26 10:22:55 -0400776#define RBD_LOCK_TIMEOUT_DEFAULT 0 /* no timeout */
Alex Elder98571b52013-01-20 14:44:42 -0600777#define RBD_READ_ONLY_DEFAULT false
Ilya Dryomov80de1912016-09-20 14:23:17 +0200778#define RBD_LOCK_ON_READ_DEFAULT false
Ilya Dryomove010dd02017-04-13 12:17:39 +0200779#define RBD_EXCLUSIVE_DEFAULT false
Ilya Dryomovd9360542018-03-23 06:14:47 +0100780#define RBD_TRIM_DEFAULT true
Alex Elder98571b52013-01-20 14:44:42 -0600781
Ilya Dryomovc3001562018-07-03 15:28:43 +0200782struct parse_rbd_opts_ctx {
783 struct rbd_spec *spec;
784 struct rbd_options *opts;
785};
786
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700787static int parse_rbd_opts_token(char *c, void *private)
788{
Ilya Dryomovc3001562018-07-03 15:28:43 +0200789 struct parse_rbd_opts_ctx *pctx = private;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700790 substring_t argstr[MAX_OPT_ARGS];
791 int token, intval, ret;
792
Alex Elder43ae4702012-07-03 16:01:18 -0500793 token = match_token(c, rbd_opts_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700794 if (token < Opt_last_int) {
795 ret = match_int(&argstr[0], &intval);
796 if (ret < 0) {
Ilya Dryomov2f56b6b2018-06-27 16:38:13 +0200797 pr_err("bad option arg (not int) at '%s'\n", c);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700798 return ret;
799 }
800 dout("got int token %d val %d\n", token, intval);
801 } else if (token > Opt_last_int && token < Opt_last_string) {
Ilya Dryomov210c1042015-06-22 13:24:48 +0300802 dout("got string token %d val %s\n", token, argstr[0].from);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700803 } else {
804 dout("got token %d\n", token);
805 }
806
807 switch (token) {
Ilya Dryomovb5584182015-06-23 16:21:19 +0300808 case Opt_queue_depth:
809 if (intval < 1) {
810 pr_err("queue_depth out of range\n");
811 return -EINVAL;
812 }
Ilya Dryomovc3001562018-07-03 15:28:43 +0200813 pctx->opts->queue_depth = intval;
Ilya Dryomovb5584182015-06-23 16:21:19 +0300814 break;
Dongsheng Yang34f55d02018-03-26 10:22:55 -0400815 case Opt_lock_timeout:
816 /* 0 is "wait forever" (i.e. infinite timeout) */
817 if (intval < 0 || intval > INT_MAX / 1000) {
818 pr_err("lock_timeout out of range\n");
819 return -EINVAL;
820 }
Ilya Dryomovc3001562018-07-03 15:28:43 +0200821 pctx->opts->lock_timeout = msecs_to_jiffies(intval * 1000);
Dongsheng Yang34f55d02018-03-26 10:22:55 -0400822 break;
Ilya Dryomovb26c0472018-07-03 15:28:43 +0200823 case Opt_pool_ns:
824 kfree(pctx->spec->pool_ns);
825 pctx->spec->pool_ns = match_strdup(argstr);
826 if (!pctx->spec->pool_ns)
827 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700828 break;
Alex Eldercc0538b2012-08-10 13:12:07 -0700829 case Opt_read_only:
Ilya Dryomovc3001562018-07-03 15:28:43 +0200830 pctx->opts->read_only = true;
Alex Eldercc0538b2012-08-10 13:12:07 -0700831 break;
832 case Opt_read_write:
Ilya Dryomovc3001562018-07-03 15:28:43 +0200833 pctx->opts->read_only = false;
Alex Eldercc0538b2012-08-10 13:12:07 -0700834 break;
Ilya Dryomov80de1912016-09-20 14:23:17 +0200835 case Opt_lock_on_read:
Ilya Dryomovc3001562018-07-03 15:28:43 +0200836 pctx->opts->lock_on_read = true;
Ilya Dryomov80de1912016-09-20 14:23:17 +0200837 break;
Ilya Dryomove010dd02017-04-13 12:17:39 +0200838 case Opt_exclusive:
Ilya Dryomovc3001562018-07-03 15:28:43 +0200839 pctx->opts->exclusive = true;
Ilya Dryomove010dd02017-04-13 12:17:39 +0200840 break;
Ilya Dryomovd9360542018-03-23 06:14:47 +0100841 case Opt_notrim:
Ilya Dryomovc3001562018-07-03 15:28:43 +0200842 pctx->opts->trim = false;
Ilya Dryomovd9360542018-03-23 06:14:47 +0100843 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700844 default:
Ilya Dryomov210c1042015-06-22 13:24:48 +0300845 /* libceph prints "bad option" msg */
846 return -EINVAL;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700847 }
Ilya Dryomov210c1042015-06-22 13:24:48 +0300848
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700849 return 0;
850}
851
Guangliang Zhao6d2940c2014-03-13 11:21:35 +0800852static char* obj_op_name(enum obj_operation_type op_type)
853{
854 switch (op_type) {
855 case OBJ_OP_READ:
856 return "read";
857 case OBJ_OP_WRITE:
858 return "write";
Guangliang Zhao90e98c52014-04-01 22:22:16 +0800859 case OBJ_OP_DISCARD:
860 return "discard";
Guangliang Zhao6d2940c2014-03-13 11:21:35 +0800861 default:
862 return "???";
863 }
864}
865
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700866/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700867 * Destroy ceph client
Alex Elderd23a4b32012-01-29 13:57:43 -0600868 *
Alex Elder432b8582012-01-29 13:57:44 -0600869 * Caller must hold rbd_client_list_lock.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700870 */
871static void rbd_client_release(struct kref *kref)
872{
873 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
874
Alex Elder37206ee2013-02-20 17:32:08 -0600875 dout("%s: rbdc %p\n", __func__, rbdc);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500876 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700877 list_del(&rbdc->node);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500878 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700879
880 ceph_destroy_client(rbdc->client);
881 kfree(rbdc);
882}
883
884/*
885 * Drop reference to ceph client node. If it's not referenced anymore, release
886 * it.
887 */
Alex Elder9d3997f2012-10-25 23:34:42 -0500888static void rbd_put_client(struct rbd_client *rbdc)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700889{
Alex Elderc53d5892012-10-25 23:34:42 -0500890 if (rbdc)
891 kref_put(&rbdc->kref, rbd_client_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700892}
893
Ilya Dryomovdd435852018-02-22 13:43:24 +0100894static int wait_for_latest_osdmap(struct ceph_client *client)
895{
896 u64 newest_epoch;
897 int ret;
898
899 ret = ceph_monc_get_version(&client->monc, "osdmap", &newest_epoch);
900 if (ret)
901 return ret;
902
903 if (client->osdc.osdmap->epoch >= newest_epoch)
904 return 0;
905
906 ceph_osdc_maybe_request_map(&client->osdc);
907 return ceph_monc_wait_osdmap(&client->monc, newest_epoch,
908 client->options->mount_timeout);
909}
910
Ilya Dryomov5feb0d8d2018-02-22 13:19:04 +0100911/*
912 * Get a ceph client with specific addr and configuration, if one does
913 * not exist create it. Either way, ceph_opts is consumed by this
914 * function.
915 */
916static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
917{
918 struct rbd_client *rbdc;
Ilya Dryomovdd435852018-02-22 13:43:24 +0100919 int ret;
Ilya Dryomov5feb0d8d2018-02-22 13:19:04 +0100920
921 mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
922 rbdc = rbd_client_find(ceph_opts);
Ilya Dryomovdd435852018-02-22 13:43:24 +0100923 if (rbdc) {
Ilya Dryomov5feb0d8d2018-02-22 13:19:04 +0100924 ceph_destroy_options(ceph_opts);
Ilya Dryomovdd435852018-02-22 13:43:24 +0100925
926 /*
927 * Using an existing client. Make sure ->pg_pools is up to
928 * date before we look up the pool id in do_rbd_add().
929 */
930 ret = wait_for_latest_osdmap(rbdc->client);
931 if (ret) {
932 rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
933 rbd_put_client(rbdc);
934 rbdc = ERR_PTR(ret);
935 }
936 } else {
Ilya Dryomov5feb0d8d2018-02-22 13:19:04 +0100937 rbdc = rbd_client_create(ceph_opts);
Ilya Dryomovdd435852018-02-22 13:43:24 +0100938 }
Ilya Dryomov5feb0d8d2018-02-22 13:19:04 +0100939 mutex_unlock(&client_mutex);
940
941 return rbdc;
942}
943
Alex Eldera30b71b2012-07-10 20:30:11 -0500944static bool rbd_image_format_valid(u32 image_format)
945{
946 return image_format == 1 || image_format == 2;
947}
948
Alex Elder8e94af82012-07-25 09:32:40 -0500949static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
950{
Alex Elder103a1502012-08-02 11:29:45 -0500951 size_t size;
952 u32 snap_count;
953
954 /* The header has to start with the magic rbd header text */
955 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
956 return false;
957
Alex Elderdb2388b2012-10-20 22:17:27 -0500958 /* The bio layer requires at least sector-sized I/O */
959
960 if (ondisk->options.order < SECTOR_SHIFT)
961 return false;
962
963 /* If we use u64 in a few spots we may be able to loosen this */
964
965 if (ondisk->options.order > 8 * sizeof (int) - 1)
966 return false;
967
Alex Elder103a1502012-08-02 11:29:45 -0500968 /*
969 * The size of a snapshot header has to fit in a size_t, and
970 * that limits the number of snapshots.
971 */
972 snap_count = le32_to_cpu(ondisk->snap_count);
973 size = SIZE_MAX - sizeof (struct ceph_snap_context);
974 if (snap_count > size / sizeof (__le64))
975 return false;
976
977 /*
978 * Not only that, but the size of the entire the snapshot
979 * header must also be representable in a size_t.
980 */
981 size -= snap_count * sizeof (__le64);
982 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
983 return false;
984
985 return true;
Alex Elder8e94af82012-07-25 09:32:40 -0500986}
987
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700988/*
Ilya Dryomov5bc3fb12017-01-25 18:16:22 +0100989 * returns the size of an object in the image
990 */
991static u32 rbd_obj_bytes(struct rbd_image_header *header)
992{
993 return 1U << header->obj_order;
994}
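
/*
 * For example, the common default obj_order of 22 gives 1U << 22 ==
 * 4 MiB objects; rbd_dev_ondisk_valid() above bounds a format 1
 * image's order to [SECTOR_SHIFT, 31].
 */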

static void rbd_init_layout(struct rbd_device *rbd_dev)
{
        if (rbd_dev->header.stripe_unit == 0 ||
            rbd_dev->header.stripe_count == 0) {
                rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
                rbd_dev->header.stripe_count = 1;
        }

        rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
        rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
        rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
        rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
                          rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
        RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
}
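
/*
 * In other words (illustrative): an image without fancy striping ends
 * up with stripe_unit == object_size and stripe_count == 1, i.e. plain
 * one-object-at-a-time layout, and data goes to the image's own pool
 * unless a separate data pool was chosen at image creation.
 */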

/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
                                 struct rbd_image_header_ondisk *ondisk)
{
        struct rbd_image_header *header = &rbd_dev->header;
        bool first_time = header->object_prefix == NULL;
        struct ceph_snap_context *snapc;
        char *object_prefix = NULL;
        char *snap_names = NULL;
        u64 *snap_sizes = NULL;
        u32 snap_count;
        int ret = -ENOMEM;
        u32 i;

        /* Allocate this now to avoid having to handle failure below */

        if (first_time) {
                object_prefix = kstrndup(ondisk->object_prefix,
                                         sizeof(ondisk->object_prefix),
                                         GFP_KERNEL);
                if (!object_prefix)
                        return -ENOMEM;
        }

        /* Allocate the snapshot context and fill it in */

        snap_count = le32_to_cpu(ondisk->snap_count);
        snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
        if (!snapc)
                goto out_err;
        snapc->seq = le64_to_cpu(ondisk->snap_seq);
        if (snap_count) {
                struct rbd_image_snap_ondisk *snaps;
                u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

                /* We'll keep a copy of the snapshot names... */

                if (snap_names_len > (u64)SIZE_MAX)
                        goto out_2big;
                snap_names = kmalloc(snap_names_len, GFP_KERNEL);
                if (!snap_names)
                        goto out_err;

                /* ...as well as the array of their sizes. */
                snap_sizes = kmalloc_array(snap_count,
                                           sizeof(*header->snap_sizes),
                                           GFP_KERNEL);
                if (!snap_sizes)
                        goto out_err;

                /*
                 * Copy the names, and fill in each snapshot's id
                 * and size.
                 *
                 * Note that rbd_dev_v1_header_info() guarantees the
                 * ondisk buffer we're working with has
                 * snap_names_len bytes beyond the end of the
                 * snapshot id array, so this memcpy() is safe.
                 */
                memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
                snaps = ondisk->snaps;
                for (i = 0; i < snap_count; i++) {
                        snapc->snaps[i] = le64_to_cpu(snaps[i].id);
                        snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
                }
        }

        /* We won't fail any more, fill in the header */

        if (first_time) {
                header->object_prefix = object_prefix;
                header->obj_order = ondisk->options.order;
                rbd_init_layout(rbd_dev);
        } else {
                ceph_put_snap_context(header->snapc);
                kfree(header->snap_names);
                kfree(header->snap_sizes);
        }

        /* The remaining fields always get updated (when we refresh) */

        header->image_size = le64_to_cpu(ondisk->image_size);
        header->snapc = snapc;
        header->snap_names = snap_names;
        header->snap_sizes = snap_sizes;

        return 0;
out_2big:
        ret = -EIO;
out_err:
        kfree(snap_sizes);
        kfree(snap_names);
        ceph_put_snap_context(snapc);
        kfree(object_prefix);

        return ret;
}

static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
{
        const char *snap_name;

        rbd_assert(which < rbd_dev->header.snapc->num_snaps);

        /* Skip over names until we find the one we are looking for */

        snap_name = rbd_dev->header.snap_names;
        while (which--)
                snap_name += strlen(snap_name) + 1;

        return kstrdup(snap_name, GFP_KERNEL);
}

/*
 * Snapshot id comparison function for use with qsort()/bsearch().
 * Note that result is for snapshots in *descending* order.
 */
static int snapid_compare_reverse(const void *s1, const void *s2)
{
        u64 snap_id1 = *(u64 *)s1;
        u64 snap_id2 = *(u64 *)s2;

        if (snap_id1 < snap_id2)
                return 1;
        return snap_id1 == snap_id2 ? 0 : -1;
}
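
/*
 * Worked example: for a snapc->snaps array of {12, 7, 3} (newest
 * first), bsearch()ing for id 7 with this comparator finds index 1;
 * rbd_dev_snap_index() below relies on exactly that ordering.
 */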
1141
1142/*
1143 * Search a snapshot context to see if the given snapshot id is
1144 * present.
1145 *
1146 * Returns the position of the snapshot id in the array if it's found,
1147 * or BAD_SNAP_INDEX otherwise.
1148 *
1149 * Note: The snapshot array is in kept sorted (by the osd) in
1150 * reverse order, highest snapshot id first.
1151 */
Alex Elder9682fc62013-04-30 00:44:33 -05001152static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
1153{
1154 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
Alex Elder30d1cff2013-05-01 12:43:03 -05001155 u64 *found;
Alex Elder9682fc62013-04-30 00:44:33 -05001156
Alex Elder30d1cff2013-05-01 12:43:03 -05001157 found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
1158 sizeof (snap_id), snapid_compare_reverse);
Alex Elder9682fc62013-04-30 00:44:33 -05001159
Alex Elder30d1cff2013-05-01 12:43:03 -05001160 return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
Alex Elder9682fc62013-04-30 00:44:33 -05001161}
1162
Alex Elder2ad3d712013-04-30 00:44:33 -05001163static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
1164 u64 snap_id)
Alex Elder54cac612013-04-30 00:44:33 -05001165{
1166 u32 which;
Josh Durginda6a6b62013-09-04 17:57:31 -07001167 const char *snap_name;
Alex Elder54cac612013-04-30 00:44:33 -05001168
1169 which = rbd_dev_snap_index(rbd_dev, snap_id);
1170 if (which == BAD_SNAP_INDEX)
Josh Durginda6a6b62013-09-04 17:57:31 -07001171 return ERR_PTR(-ENOENT);
Alex Elder54cac612013-04-30 00:44:33 -05001172
Josh Durginda6a6b62013-09-04 17:57:31 -07001173 snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1174 return snap_name ? snap_name : ERR_PTR(-ENOMEM);
Alex Elder54cac612013-04-30 00:44:33 -05001175}
1176
Alex Elder9e15b772012-10-30 19:40:33 -05001177static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
1178{
Alex Elder9e15b772012-10-30 19:40:33 -05001179 if (snap_id == CEPH_NOSNAP)
1180 return RBD_SNAP_HEAD_NAME;
1181
Alex Elder54cac612013-04-30 00:44:33 -05001182 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1183 if (rbd_dev->image_format == 1)
1184 return rbd_dev_v1_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -05001185
Alex Elder54cac612013-04-30 00:44:33 -05001186 return rbd_dev_v2_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -05001187}
1188
Alex Elder2ad3d712013-04-30 00:44:33 -05001189static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
1190 u64 *snap_size)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001191{
Alex Elder2ad3d712013-04-30 00:44:33 -05001192 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1193 if (snap_id == CEPH_NOSNAP) {
1194 *snap_size = rbd_dev->header.image_size;
1195 } else if (rbd_dev->image_format == 1) {
1196 u32 which;
Alex Elder00f1f362012-02-07 12:03:36 -06001197
Alex Elder2ad3d712013-04-30 00:44:33 -05001198 which = rbd_dev_snap_index(rbd_dev, snap_id);
1199 if (which == BAD_SNAP_INDEX)
1200 return -ENOENT;
Alex Elder00f1f362012-02-07 12:03:36 -06001201
Alex Elder2ad3d712013-04-30 00:44:33 -05001202 *snap_size = rbd_dev->header.snap_sizes[which];
1203 } else {
1204 u64 size = 0;
1205 int ret;
1206
1207 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
1208 if (ret)
1209 return ret;
1210
1211 *snap_size = size;
1212 }
1213 return 0;
1214}
1215
1216static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
1217 u64 *snap_features)
1218{
1219 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1220 if (snap_id == CEPH_NOSNAP) {
1221 *snap_features = rbd_dev->header.features;
1222 } else if (rbd_dev->image_format == 1) {
1223 *snap_features = 0; /* No features for format 1 */
1224 } else {
1225 u64 features = 0;
1226 int ret;
1227
1228 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
1229 if (ret)
1230 return ret;
1231
1232 *snap_features = features;
1233 }
1234 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001235}
1236
Alex Elderd1cf5782013-04-27 09:59:30 -05001237static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001238{
Alex Elder8f4b7d92013-05-06 07:40:30 -05001239 u64 snap_id = rbd_dev->spec->snap_id;
Alex Elder2ad3d712013-04-30 00:44:33 -05001240 u64 size = 0;
1241 u64 features = 0;
1242 int ret;
Alex Elder8b0241f2013-04-25 23:15:08 -05001243
Alex Elder2ad3d712013-04-30 00:44:33 -05001244 ret = rbd_snap_size(rbd_dev, snap_id, &size);
1245 if (ret)
1246 return ret;
1247 ret = rbd_snap_features(rbd_dev, snap_id, &features);
1248 if (ret)
1249 return ret;
1250
1251 rbd_dev->mapping.size = size;
1252 rbd_dev->mapping.features = features;
1253
Alex Elder8b0241f2013-04-25 23:15:08 -05001254 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001255}
1256
Alex Elderd1cf5782013-04-27 09:59:30 -05001257static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1258{
1259 rbd_dev->mapping.size = 0;
1260 rbd_dev->mapping.features = 0;
Alex Elder200a6a82013-04-28 23:32:34 -05001261}
1262
Ilya Dryomov5359a172018-01-20 10:30:10 +01001263static void zero_bvec(struct bio_vec *bv)
Alex Elder65ccfe22012-08-09 10:33:26 -07001264{
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001265 void *buf;
Ilya Dryomov5359a172018-01-20 10:30:10 +01001266 unsigned long flags;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001267
Ilya Dryomov5359a172018-01-20 10:30:10 +01001268 buf = bvec_kmap_irq(bv, &flags);
1269 memset(buf, 0, bv->bv_len);
1270 flush_dcache_page(bv->bv_page);
1271 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001272}
1273
Ilya Dryomov5359a172018-01-20 10:30:10 +01001274static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
Alex Elderb9434c52013-04-19 15:34:50 -05001275{
Ilya Dryomov5359a172018-01-20 10:30:10 +01001276 struct ceph_bio_iter it = *bio_pos;
Alex Elderb9434c52013-04-19 15:34:50 -05001277
Ilya Dryomov5359a172018-01-20 10:30:10 +01001278 ceph_bio_iter_advance(&it, off);
1279 ceph_bio_iter_advance_step(&it, bytes, ({
1280 zero_bvec(&bv);
1281 }));
Alex Elderb9434c52013-04-19 15:34:50 -05001282}
1283
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001284static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001285{
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001286 struct ceph_bvec_iter it = *bvec_pos;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001287
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001288 ceph_bvec_iter_advance(&it, off);
1289 ceph_bvec_iter_advance_step(&it, bytes, ({
1290 zero_bvec(&bv);
1291 }));
Alex Elderf7760da2012-10-20 22:17:27 -05001292}
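/*
 * Aside: the ceph_*_iter_advance_step() macros used above execute the
 * given statement block once per bio_vec fragment covered by @bytes,
 * with the current fragment (clamped to the remaining byte count)
 * bound to a local 'bv'.  Conceptually -- a sketch only, with
 * next_fragment() a hypothetical helper that also advances 'it':
 *
 *	while (bytes) {
 *		struct bio_vec bv = next_fragment(&it);
 *		u32 step = min(bytes, bv.bv_len);
 *
 *		bv.bv_len = step;
 *		zero_bvec(&bv);
 *		bytes -= step;
 *	}
 */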
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001293
Alex Elderf7760da2012-10-20 22:17:27 -05001294/*
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001295 * Zero a range in @obj_req data buffer defined by a bio (list) or
Ilya Dryomovafb97882018-02-06 19:26:35 +01001296 * (private) bio_vec array.
Alex Elderf7760da2012-10-20 22:17:27 -05001297 *
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001298 * @off is relative to the start of the data buffer.
Alex Elderf7760da2012-10-20 22:17:27 -05001299 */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001300static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
1301 u32 bytes)
Alex Elderf7760da2012-10-20 22:17:27 -05001302{
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001303 switch (obj_req->img_request->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001304 case OBJ_REQUEST_BIO:
1305 zero_bios(&obj_req->bio_pos, off, bytes);
1306 break;
1307 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01001308 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001309 zero_bvecs(&obj_req->bvec_pos, off, bytes);
1310 break;
1311 default:
1312 rbd_assert(0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001313 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06001314}
1315
1316static void rbd_obj_request_destroy(struct kref *kref);
1317static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1318{
1319 rbd_assert(obj_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001320 dout("%s: obj %p (was %d)\n", __func__, obj_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001321 kref_read(&obj_request->kref));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001322 kref_put(&obj_request->kref, rbd_obj_request_destroy);
1323}
1324
Alex Elder0f2d5be2014-04-26 14:21:44 +04001325static void rbd_img_request_get(struct rbd_img_request *img_request)
1326{
1327 dout("%s: img %p (was %d)\n", __func__, img_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001328 kref_read(&img_request->kref));
Alex Elder0f2d5be2014-04-26 14:21:44 +04001329 kref_get(&img_request->kref);
1330}
1331
Alex Elderbf0d5f502012-11-22 00:00:08 -06001332static void rbd_img_request_destroy(struct kref *kref);
1333static void rbd_img_request_put(struct rbd_img_request *img_request)
1334{
1335 rbd_assert(img_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001336 dout("%s: img %p (was %d)\n", __func__, img_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001337 kref_read(&img_request->kref));
Ilya Dryomove93aca02018-02-06 19:26:35 +01001338 kref_put(&img_request->kref, rbd_img_request_destroy);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001339}
1340
1341static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1342 struct rbd_obj_request *obj_request)
1343{
Alex Elder25dcf952013-01-25 17:08:55 -06001344 rbd_assert(obj_request->img_request == NULL);
1345
Alex Elderb155e862013-04-15 14:50:37 -05001346 /* Image request now owns object's original reference */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001347 obj_request->img_request = img_request;
Alex Elder25dcf952013-01-25 17:08:55 -06001348 img_request->obj_request_count++;
Ilya Dryomov7114eda2018-02-01 11:50:47 +01001349 img_request->pending_count++;
Ilya Dryomov15961b42018-02-01 11:50:47 +01001350 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001351}
1352
1353static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1354 struct rbd_obj_request *obj_request)
1355{
Ilya Dryomov15961b42018-02-01 11:50:47 +01001356 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001357 list_del(&obj_request->ex.oe_item);
Alex Elder25dcf952013-01-25 17:08:55 -06001358 rbd_assert(img_request->obj_request_count > 0);
1359 img_request->obj_request_count--;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001360 rbd_assert(obj_request->img_request == img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001361 rbd_obj_request_put(obj_request);
1362}
1363
Ilya Dryomov980917f2016-09-12 18:59:42 +02001364static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001365{
Ilya Dryomov980917f2016-09-12 18:59:42 +02001366 struct ceph_osd_request *osd_req = obj_request->osd_req;
1367
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001368 dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001369 obj_request, obj_request->ex.oe_objno, obj_request->ex.oe_off,
1370 obj_request->ex.oe_len, osd_req);
Ilya Dryomov980917f2016-09-12 18:59:42 +02001371 ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001372}
1373
Alex Elder0c425242013-02-08 09:55:49 -06001374/*
1375 * The default/initial value for all image request flags is 0. Each
1376 * is conditionally set to 1 at image request initialization time
1377 * and currently never changes thereafter.
1378 */
Alex Elderd0b2e942013-01-24 16:13:36 -06001379static void img_request_layered_set(struct rbd_img_request *img_request)
1380{
1381 set_bit(IMG_REQ_LAYERED, &img_request->flags);
1382 smp_mb();
1383}
1384
Alex Eldera2acd002013-05-08 22:50:04 -05001385static void img_request_layered_clear(struct rbd_img_request *img_request)
1386{
1387 clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1388 smp_mb();
1389}
1390
Alex Elderd0b2e942013-01-24 16:13:36 -06001391static bool img_request_layered_test(struct rbd_img_request *img_request)
1392{
1393 smp_mb();
1394 return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1395}
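/*
 * Note on the barriers above: plain set_bit()/clear_bit()/test_bit()
 * are not ordered against other memory accesses, so each helper pairs
 * the bitop with smp_mb().  The intended pairing, roughly (a sketch):
 *
 *	CPU0 (request init)		CPU1 (I/O path)
 *	set_bit(IMG_REQ_LAYERED, f);
 *	smp_mb();			smp_mb();
 *					test_bit(IMG_REQ_LAYERED, f);
 */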
1396
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001397static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001398{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001399 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1400
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001401 return !obj_req->ex.oe_off &&
1402 obj_req->ex.oe_len == rbd_dev->layout.object_size;
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001403}
1404
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001405static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
Alex Elder6e2a4502013-03-27 09:16:30 -05001406{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001407 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Alex Elderb9434c52013-04-19 15:34:50 -05001408
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001409 return obj_req->ex.oe_off + obj_req->ex.oe_len ==
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001410 rbd_dev->layout.object_size;
1411}
1412
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001413static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
1414{
1415 return ceph_file_extents_bytes(obj_req->img_extents,
1416 obj_req->num_img_extents);
1417}
1418
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001419static bool rbd_img_is_write(struct rbd_img_request *img_req)
1420{
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001421 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001422 case OBJ_OP_READ:
1423 return false;
1424 case OBJ_OP_WRITE:
1425 case OBJ_OP_DISCARD:
1426 return true;
1427 default:
Arnd Bergmannc6244b32018-04-04 14:53:39 +02001428 BUG();
Alex Elder6e2a4502013-03-27 09:16:30 -05001429 }
Alex Elder6e2a4502013-03-27 09:16:30 -05001430}
1431
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001432static void rbd_obj_handle_request(struct rbd_obj_request *obj_req);
Ilya Dryomov27617132015-07-16 17:36:11 +03001433
Ilya Dryomov85e084f2016-04-28 16:07:24 +02001434static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001435{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001436 struct rbd_obj_request *obj_req = osd_req->r_priv;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001437
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001438 dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
1439 osd_req->r_result, obj_req);
1440 rbd_assert(osd_req == obj_req->osd_req);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001441
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001442 obj_req->result = osd_req->r_result < 0 ? osd_req->r_result : 0;
1443 if (!obj_req->result && !rbd_img_is_write(obj_req->img_request))
1444 obj_req->xferred = osd_req->r_result;
1445 else
1446 /*
1447 * Writes aren't allowed to return a data payload. In some
1448 * guarded write cases (e.g. stat + zero on an empty object)
1449 * a stat response makes it through, but we don't care.
1450 */
1451 obj_req->xferred = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001452
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001453 rbd_obj_handle_request(obj_req);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001454}
1455
Alex Elder9d4df012013-04-19 15:34:50 -05001456static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
Alex Elder430c28c2013-04-03 21:32:51 -05001457{
Alex Elder8c042b02013-04-03 01:28:58 -05001458 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder430c28c2013-04-03 21:32:51 -05001459
Ilya Dryomova162b302018-01-30 17:52:10 +01001460 osd_req->r_flags = CEPH_OSD_FLAG_READ;
Ilya Dryomov7c848832016-09-15 17:56:39 +02001461 osd_req->r_snapid = obj_request->img_request->snap_id;
Alex Elder9d4df012013-04-19 15:34:50 -05001462}
1463
1464static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
1465{
Alex Elder9d4df012013-04-19 15:34:50 -05001466 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder9d4df012013-04-19 15:34:50 -05001467
Ilya Dryomova162b302018-01-30 17:52:10 +01001468 osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
Arnd Bergmannfac02dd2018-07-13 22:18:37 +02001469 ktime_get_real_ts64(&osd_req->r_mtime);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001470 osd_req->r_data_offset = obj_request->ex.oe_off;
Alex Elder430c28c2013-04-03 21:32:51 -05001471}
1472
Ilya Dryomovbc812072017-01-25 18:16:23 +01001473static struct ceph_osd_request *
Ilya Dryomova162b302018-01-30 17:52:10 +01001474rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops)
Ilya Dryomovbc812072017-01-25 18:16:23 +01001475{
Ilya Dryomova162b302018-01-30 17:52:10 +01001476 struct rbd_img_request *img_req = obj_req->img_request;
1477 struct rbd_device *rbd_dev = img_req->rbd_dev;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001478 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1479 struct ceph_osd_request *req;
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001480 const char *name_format = rbd_dev->image_format == 1 ?
1481 RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001482
Ilya Dryomova162b302018-01-30 17:52:10 +01001483 req = ceph_osdc_alloc_request(osdc,
1484 (rbd_img_is_write(img_req) ? img_req->snapc : NULL),
1485 num_ops, false, GFP_NOIO);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001486 if (!req)
1487 return NULL;
1488
Ilya Dryomovbc812072017-01-25 18:16:23 +01001489 req->r_callback = rbd_osd_req_callback;
Ilya Dryomova162b302018-01-30 17:52:10 +01001490 req->r_priv = obj_req;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001491
Ilya Dryomovb26c0472018-07-03 15:28:43 +02001492 /*
1493	 * Data objects may be stored in a separate pool, but they always
1494	 * use the same namespace there as the header object does in its pool.
1495 */
1496 ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001497 req->r_base_oloc.pool = rbd_dev->layout.pool_id;
Ilya Dryomovb26c0472018-07-03 15:28:43 +02001498
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001499 if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001500 rbd_dev->header.object_prefix, obj_req->ex.oe_objno))
Ilya Dryomovbc812072017-01-25 18:16:23 +01001501 goto err_req;
1502
1503 if (ceph_osdc_alloc_messages(req, GFP_NOIO))
1504 goto err_req;
1505
1506 return req;
1507
1508err_req:
1509 ceph_osdc_put_request(req);
1510 return NULL;
1511}
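/*
 * For illustration, the object name built above is just the image's
 * object prefix plus the object number.  Assuming RBD_V2_DATA_FORMAT
 * is "%s.%016llx" (format 1 differs only in the field width), a
 * hypothetical prefix gives:
 *
 *	prefix "rbd_data.101a6b8b4567", oe_objno 1
 *		-> "rbd_data.101a6b8b4567.0000000000000001"
 */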
1512
Alex Elderbf0d5f502012-11-22 00:00:08 -06001513static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
1514{
1515 ceph_osdc_put_request(osd_req);
1516}
1517
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001518static struct rbd_obj_request *rbd_obj_request_create(void)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001519{
1520 struct rbd_obj_request *obj_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001521
Ilya Dryomov5a60e872015-06-24 17:24:33 +03001522 obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
Ilya Dryomov6c696d82017-01-25 18:16:23 +01001523 if (!obj_request)
Alex Elderf907ad52013-05-01 12:43:03 -05001524 return NULL;
Alex Elderf907ad52013-05-01 12:43:03 -05001525
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001526 ceph_object_extent_init(&obj_request->ex);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001527 kref_init(&obj_request->kref);
1528
Ilya Dryomov67e2b652017-01-25 18:16:22 +01001529 dout("%s %p\n", __func__, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001530 return obj_request;
1531}
1532
1533static void rbd_obj_request_destroy(struct kref *kref)
1534{
1535 struct rbd_obj_request *obj_request;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001536 u32 i;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001537
1538 obj_request = container_of(kref, struct rbd_obj_request, kref);
1539
Alex Elder37206ee2013-02-20 17:32:08 -06001540 dout("%s: obj %p\n", __func__, obj_request);
1541
Alex Elderbf0d5f502012-11-22 00:00:08 -06001542 if (obj_request->osd_req)
1543 rbd_osd_req_destroy(obj_request->osd_req);
1544
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001545 switch (obj_request->img_request->data_type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001546 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001547 case OBJ_REQUEST_BIO:
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001548 case OBJ_REQUEST_BVECS:
Ilya Dryomov5359a172018-01-20 10:30:10 +01001549 break; /* Nothing to do */
Ilya Dryomovafb97882018-02-06 19:26:35 +01001550 case OBJ_REQUEST_OWN_BVECS:
1551 kfree(obj_request->bvec_pos.bvecs);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001552 break;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001553 default:
1554 rbd_assert(0);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001555 }
1556
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001557 kfree(obj_request->img_extents);
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001558 if (obj_request->copyup_bvecs) {
1559 for (i = 0; i < obj_request->copyup_bvec_count; i++) {
1560 if (obj_request->copyup_bvecs[i].bv_page)
1561 __free_page(obj_request->copyup_bvecs[i].bv_page);
1562 }
1563 kfree(obj_request->copyup_bvecs);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001564 }
1565
Alex Elder868311b2013-05-01 12:43:03 -05001566 kmem_cache_free(rbd_obj_request_cache, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001567}
1568
Alex Elderfb65d2282013-05-08 22:50:04 -05001569/* It's OK to call this for a device with no parent */
1570
1571static void rbd_spec_put(struct rbd_spec *spec);
1572static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1573{
1574 rbd_dev_remove_parent(rbd_dev);
1575 rbd_spec_put(rbd_dev->parent_spec);
1576 rbd_dev->parent_spec = NULL;
1577 rbd_dev->parent_overlap = 0;
1578}
1579
Alex Elderbf0d5f502012-11-22 00:00:08 -06001580/*
Alex Eldera2acd002013-05-08 22:50:04 -05001581 * Parent image reference counting is used to determine when an
1582 * image's parent fields can be safely torn down--after there are no
1583 * more in-flight requests to the parent image. When the last
1584 * reference is dropped, cleaning them up is safe.
1585 */
1586static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1587{
1588 int counter;
1589
1590 if (!rbd_dev->parent_spec)
1591 return;
1592
1593 counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1594 if (counter > 0)
1595 return;
1596
1597 /* Last reference; clean up parent data structures */
1598
1599 if (!counter)
1600 rbd_dev_unparent(rbd_dev);
1601 else
Ilya Dryomov9584d502014-07-11 12:11:20 +04001602 rbd_warn(rbd_dev, "parent reference underflow");
Alex Eldera2acd002013-05-08 22:50:04 -05001603}
1604
1605/*
1606 * If an image has a non-zero parent overlap, get a reference to its
1607 * parent.
1608 *
1609 * Returns true if the rbd device has a parent with a non-zero
1610 * overlap and a reference for it was successfully taken, or
1611 * false otherwise.
1612 */
1613static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1614{
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001615 int counter = 0;
Alex Eldera2acd002013-05-08 22:50:04 -05001616
1617 if (!rbd_dev->parent_spec)
1618 return false;
1619
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001620 down_read(&rbd_dev->header_rwsem);
1621 if (rbd_dev->parent_overlap)
1622 counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1623 up_read(&rbd_dev->header_rwsem);
Alex Eldera2acd002013-05-08 22:50:04 -05001624
1625 if (counter < 0)
Ilya Dryomov9584d502014-07-11 12:11:20 +04001626 rbd_warn(rbd_dev, "parent reference overflow");
Alex Eldera2acd002013-05-08 22:50:04 -05001627
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001628 return counter > 0;
Alex Eldera2acd002013-05-08 22:50:04 -05001629}
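/*
 * Typical get/put pairing (this is what the image request code below
 * does): the parent reference is taken when a request is created and
 * dropped when the request is destroyed.
 *
 *	if (rbd_dev_parent_get(rbd_dev))
 *		img_request_layered_set(img_request);
 *	...
 *	if (img_request_layered_test(img_request)) {
 *		img_request_layered_clear(img_request);
 *		rbd_dev_parent_put(img_request->rbd_dev);
 *	}
 */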
1630
Alex Elderbf0d5f502012-11-22 00:00:08 -06001631/*
1632 * Caller is responsible for filling in the list of object requests
1633 * that comprises the image request, and the Linux request pointer
1634 * (if there is one).
1635 */
Alex Eldercc344fa2013-02-19 12:25:56 -06001636static struct rbd_img_request *rbd_img_request_create(
1637 struct rbd_device *rbd_dev,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08001638 enum obj_operation_type op_type,
Josh Durgin4e752f02014-04-08 11:12:11 -07001639 struct ceph_snap_context *snapc)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001640{
1641 struct rbd_img_request *img_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001642
Ilya Dryomova0c58952018-01-22 16:03:06 +01001643 img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001644 if (!img_request)
1645 return NULL;
1646
Alex Elderbf0d5f502012-11-22 00:00:08 -06001647 img_request->rbd_dev = rbd_dev;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001648 img_request->op_type = op_type;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001649 if (!rbd_img_is_write(img_request))
Alex Elderbf0d5f502012-11-22 00:00:08 -06001650 img_request->snap_id = rbd_dev->spec->snap_id;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001651 else
1652 img_request->snapc = snapc;
1653
Alex Eldera2acd002013-05-08 22:50:04 -05001654 if (rbd_dev_parent_get(rbd_dev))
Alex Elderd0b2e942013-01-24 16:13:36 -06001655 img_request_layered_set(img_request);
Ilya Dryomova0c58952018-01-22 16:03:06 +01001656
Alex Elderbf0d5f502012-11-22 00:00:08 -06001657 spin_lock_init(&img_request->completion_lock);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001658 INIT_LIST_HEAD(&img_request->object_extents);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001659 kref_init(&img_request->kref);
1660
Ilya Dryomovdfd98752018-02-06 19:26:35 +01001661 dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev,
1662 obj_op_name(op_type), img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001663 return img_request;
1664}
1665
1666static void rbd_img_request_destroy(struct kref *kref)
1667{
1668 struct rbd_img_request *img_request;
1669 struct rbd_obj_request *obj_request;
1670 struct rbd_obj_request *next_obj_request;
1671
1672 img_request = container_of(kref, struct rbd_img_request, kref);
1673
Alex Elder37206ee2013-02-20 17:32:08 -06001674 dout("%s: img %p\n", __func__, img_request);
1675
Alex Elderbf0d5f502012-11-22 00:00:08 -06001676 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1677 rbd_img_obj_request_del(img_request, obj_request);
Alex Elder25dcf952013-01-25 17:08:55 -06001678 rbd_assert(img_request->obj_request_count == 0);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001679
Alex Eldera2acd002013-05-08 22:50:04 -05001680 if (img_request_layered_test(img_request)) {
1681 img_request_layered_clear(img_request);
1682 rbd_dev_parent_put(img_request->rbd_dev);
1683 }
1684
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001685 if (rbd_img_is_write(img_request))
Alex Elder812164f82013-04-30 00:44:32 -05001686 ceph_put_snap_context(img_request->snapc);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001687
Alex Elder1c2a9df2013-05-01 12:43:03 -05001688 kmem_cache_free(rbd_img_request_cache, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001689}
1690
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001691static void prune_extents(struct ceph_file_extent *img_extents,
1692 u32 *num_img_extents, u64 overlap)
Alex Eldere93f3152013-05-08 22:50:04 -05001693{
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001694 u32 cnt = *num_img_extents;
Alex Eldere93f3152013-05-08 22:50:04 -05001695
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001696 /* drop extents completely beyond the overlap */
1697 while (cnt && img_extents[cnt - 1].fe_off >= overlap)
1698 cnt--;
Alex Eldere93f3152013-05-08 22:50:04 -05001699
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001700 if (cnt) {
1701 struct ceph_file_extent *ex = &img_extents[cnt - 1];
Alex Eldere93f3152013-05-08 22:50:04 -05001702
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001703 /* trim final overlapping extent */
1704 if (ex->fe_off + ex->fe_len > overlap)
1705 ex->fe_len = overlap - ex->fe_off;
Alex Elder12178572013-02-08 09:55:49 -06001706 }
1707
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001708 *num_img_extents = cnt;
Alex Elder21692382013-04-05 01:27:12 -05001709}
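/*
 * Worked example: with extents sorted by offset and an overlap of 6M,
 * img_extents { 0M~4M, 5M~2M, 8M~1M } prunes to { 0M~4M, 5M~1M }:
 * 8M~1M starts at or beyond the overlap and is dropped entirely, and
 * 5M~2M is trimmed so that it ends at the overlap point.
 */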
1710
Alex Elderf1a47392013-04-19 15:34:50 -05001711/*
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001712 * Determine the byte range(s) covered by either just the object extent
1713 * or the entire object in the parent image.
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001714 */
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001715static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
1716 bool entire)
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001717{
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001718 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Alex Elderc5b5ef62013-02-11 12:33:24 -06001719 int ret;
1720
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001721 if (!rbd_dev->parent_overlap)
1722 return 0;
1723
1724 ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
1725 entire ? 0 : obj_req->ex.oe_off,
1726 entire ? rbd_dev->layout.object_size :
1727 obj_req->ex.oe_len,
1728 &obj_req->img_extents,
1729 &obj_req->num_img_extents);
1730 if (ret)
1731 return ret;
1732
1733 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
1734 rbd_dev->parent_overlap);
1735 return 0;
1736}
1737
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001738static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
1739{
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001740 switch (obj_req->img_request->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001741 case OBJ_REQUEST_BIO:
1742 osd_req_op_extent_osd_data_bio(obj_req->osd_req, which,
1743 &obj_req->bio_pos,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001744 obj_req->ex.oe_len);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001745 break;
1746 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01001747 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001748 rbd_assert(obj_req->bvec_pos.iter.bi_size ==
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001749 obj_req->ex.oe_len);
Ilya Dryomovafb97882018-02-06 19:26:35 +01001750 rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001751 osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which,
1752 &obj_req->bvec_pos);
1753 break;
1754 default:
1755 rbd_assert(0);
1756 }
1757}
1758
1759static int rbd_obj_setup_read(struct rbd_obj_request *obj_req)
1760{
Ilya Dryomova162b302018-01-30 17:52:10 +01001761 obj_req->osd_req = rbd_osd_req_create(obj_req, 1);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001762 if (!obj_req->osd_req)
Ilya Dryomov710214e2016-09-15 17:53:32 +02001763 return -ENOMEM;
1764
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001765 osd_req_op_extent_init(obj_req->osd_req, 0, CEPH_OSD_OP_READ,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001766 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001767 rbd_osd_req_setup_data(obj_req, 0);
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001768
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001769 rbd_osd_req_format_read(obj_req);
1770 return 0;
1771}
1772
1773static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req,
1774 unsigned int which)
1775{
1776 struct page **pages;
Ilya Dryomov710214e2016-09-15 17:53:32 +02001777
Alex Elderc5b5ef62013-02-11 12:33:24 -06001778 /*
1779 * The response data for a STAT call consists of:
1780 * le64 length;
1781 * struct {
1782 * le32 tv_sec;
1783 * le32 tv_nsec;
1784 * } mtime;
1785 */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001786 pages = ceph_alloc_page_vector(1, GFP_NOIO);
1787 if (IS_ERR(pages))
1788 return PTR_ERR(pages);
Alex Elderc5b5ef62013-02-11 12:33:24 -06001789
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001790 osd_req_op_init(obj_req->osd_req, which, CEPH_OSD_OP_STAT, 0);
1791 osd_req_op_raw_data_in_pages(obj_req->osd_req, which, pages,
1792 8 + sizeof(struct ceph_timespec),
1793 0, false, true);
Ilya Dryomov980917f2016-09-12 18:59:42 +02001794 return 0;
Alex Elderc5b5ef62013-02-11 12:33:24 -06001795}
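/*
 * That reply is 8 + 8 = 16 bytes (ceph_timespec is two le32s), so the
 * single page allocated above is more than enough to receive it.
 */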
1796
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001797static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req,
1798 unsigned int which)
Alex Elderb454e362013-04-19 15:34:50 -05001799{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001800 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1801 u16 opcode;
Alex Elderb454e362013-04-19 15:34:50 -05001802
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001803 osd_req_op_alloc_hint_init(obj_req->osd_req, which++,
1804 rbd_dev->layout.object_size,
1805 rbd_dev->layout.object_size);
Alex Elderb454e362013-04-19 15:34:50 -05001806
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001807 if (rbd_obj_is_entire(obj_req))
1808 opcode = CEPH_OSD_OP_WRITEFULL;
1809 else
1810 opcode = CEPH_OSD_OP_WRITE;
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001811
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001812 osd_req_op_extent_init(obj_req->osd_req, which, opcode,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001813 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001814 rbd_osd_req_setup_data(obj_req, which++);
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001815
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001816 rbd_assert(which == obj_req->osd_req->r_num_ops);
1817 rbd_osd_req_format_write(obj_req);
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001818}
1819
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001820static int rbd_obj_setup_write(struct rbd_obj_request *obj_req)
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001821{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001822 unsigned int num_osd_ops, which = 0;
1823 int ret;
Ilya Dryomov058aa992016-09-12 14:44:45 +02001824
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001825 /* reverse map the entire object onto the parent */
1826 ret = rbd_obj_calc_img_extents(obj_req, true);
1827 if (ret)
1828 return ret;
1829
1830 if (obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001831 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
1832 num_osd_ops = 3; /* stat + setallochint + write/writefull */
1833 } else {
1834 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
1835 num_osd_ops = 2; /* setallochint + write/writefull */
1836 }
1837
Ilya Dryomova162b302018-01-30 17:52:10 +01001838 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001839 if (!obj_req->osd_req)
1840 return -ENOMEM;
1841
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001842 if (obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001843 ret = __rbd_obj_setup_stat(obj_req, which++);
1844 if (ret)
1845 return ret;
1846 }
1847
1848 __rbd_obj_setup_write(obj_req, which);
1849 return 0;
1850}
1851
1852static void __rbd_obj_setup_discard(struct rbd_obj_request *obj_req,
1853 unsigned int which)
1854{
1855 u16 opcode;
1856
1857 if (rbd_obj_is_entire(obj_req)) {
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001858 if (obj_req->num_img_extents) {
Ilya Dryomov2bb1e562018-02-06 19:26:34 +01001859 osd_req_op_init(obj_req->osd_req, which++,
1860 CEPH_OSD_OP_CREATE, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001861 opcode = CEPH_OSD_OP_TRUNCATE;
1862 } else {
1863 osd_req_op_init(obj_req->osd_req, which++,
1864 CEPH_OSD_OP_DELETE, 0);
1865 opcode = 0;
1866 }
1867 } else if (rbd_obj_is_tail(obj_req)) {
1868 opcode = CEPH_OSD_OP_TRUNCATE;
1869 } else {
1870 opcode = CEPH_OSD_OP_ZERO;
1871 }
1872
1873 if (opcode)
1874 osd_req_op_extent_init(obj_req->osd_req, which++, opcode,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001875 obj_req->ex.oe_off, obj_req->ex.oe_len,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001876 0, 0);
1877
1878 rbd_assert(which == obj_req->osd_req->r_num_ops);
1879 rbd_osd_req_format_write(obj_req);
1880}
1881
1882static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req)
1883{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001884 unsigned int num_osd_ops, which = 0;
1885 int ret;
1886
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001887 /* reverse map the entire object onto the parent */
1888 ret = rbd_obj_calc_img_extents(obj_req, true);
1889 if (ret)
1890 return ret;
1891
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001892 if (rbd_obj_is_entire(obj_req)) {
1893 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
Ilya Dryomov2bb1e562018-02-06 19:26:34 +01001894 if (obj_req->num_img_extents)
1895 num_osd_ops = 2; /* create + truncate */
1896 else
1897 num_osd_ops = 1; /* delete */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001898 } else {
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001899 if (obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001900 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
1901 num_osd_ops = 2; /* stat + truncate/zero */
1902 } else {
1903 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
1904 num_osd_ops = 1; /* truncate/zero */
1905 }
1906 }
1907
Ilya Dryomova162b302018-01-30 17:52:10 +01001908 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001909 if (!obj_req->osd_req)
1910 return -ENOMEM;
1911
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001912 if (!rbd_obj_is_entire(obj_req) && obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001913 ret = __rbd_obj_setup_stat(obj_req, which++);
1914 if (ret)
1915 return ret;
1916 }
1917
1918 __rbd_obj_setup_discard(obj_req, which);
1919 return 0;
1920}
1921
1922/*
1923 * For each object request in @img_req, allocate an OSD request, add
1924 * individual OSD ops and prepare them for submission. The number of
1925 * OSD ops depends on op_type and the overlap point (if any).
1926 */
1927static int __rbd_img_fill_request(struct rbd_img_request *img_req)
1928{
1929 struct rbd_obj_request *obj_req;
1930 int ret;
1931
1932 for_each_obj_request(img_req, obj_req) {
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001933 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001934 case OBJ_OP_READ:
1935 ret = rbd_obj_setup_read(obj_req);
1936 break;
1937 case OBJ_OP_WRITE:
1938 ret = rbd_obj_setup_write(obj_req);
1939 break;
1940 case OBJ_OP_DISCARD:
1941 ret = rbd_obj_setup_discard(obj_req);
1942 break;
1943 default:
1944 rbd_assert(0);
1945 }
1946 if (ret)
1947 return ret;
1948 }
1949
1950 return 0;
1951}
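/*
 * For reference, the op counts produced by the setup helpers above:
 *
 *	read:				1 (read)
 *	write, no parent overlap:	2 (setallochint + write/writefull)
 *	write, parent overlap:		3 (stat + setallochint +
 *					   write/writefull)
 *	discard, entire object:		1 (delete) or 2 (create + truncate)
 *	discard, partial object:	1 (truncate/zero) or
 *					2 (stat + truncate/zero)
 */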
1952
Ilya Dryomov5a237812018-02-06 19:26:34 +01001953union rbd_img_fill_iter {
1954 struct ceph_bio_iter bio_iter;
1955 struct ceph_bvec_iter bvec_iter;
1956};
1957
1958struct rbd_img_fill_ctx {
1959 enum obj_request_type pos_type;
1960 union rbd_img_fill_iter *pos;
1961 union rbd_img_fill_iter iter;
1962 ceph_object_extent_fn_t set_pos_fn;
Ilya Dryomovafb97882018-02-06 19:26:35 +01001963 ceph_object_extent_fn_t count_fn;
1964 ceph_object_extent_fn_t copy_fn;
Ilya Dryomov5a237812018-02-06 19:26:34 +01001965};
1966
1967static struct ceph_object_extent *alloc_object_extent(void *arg)
1968{
1969 struct rbd_img_request *img_req = arg;
1970 struct rbd_obj_request *obj_req;
1971
1972 obj_req = rbd_obj_request_create();
1973 if (!obj_req)
1974 return NULL;
1975
1976 rbd_img_obj_request_add(img_req, obj_req);
1977 return &obj_req->ex;
1978}
1979
1980/*
Ilya Dryomovafb97882018-02-06 19:26:35 +01001981 * While su != os && sc == 1 is technically not fancy (it's the same
1982 * layout as su == os && sc == 1), we can't use the nocopy path for it
1983 * because ->set_pos_fn() should be called only once per object.
1984 * ceph_file_to_extents() invokes action_fn once per stripe unit, so
1985 * treat su != os && sc == 1 as fancy.
Ilya Dryomov5a237812018-02-06 19:26:34 +01001986 */
Ilya Dryomovafb97882018-02-06 19:26:35 +01001987static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
1988{
1989 return l->stripe_unit != l->object_size;
1990}
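/*
 * Worked example with hypothetical values: su = 1M, sc = 1, os = 4M.
 * A 2M image extent at offset 0 maps into object 0 as two separate 1M
 * stripe units, so the action fn would run twice for a single object
 * -- hence such a layout must take the copying path:
 *
 *	struct ceph_file_layout l = {
 *		.stripe_unit = 1 << 20,
 *		.stripe_count = 1,
 *		.object_size = 4 << 20,
 *	};
 *	// rbd_layout_is_fancy(&l) -> true
 */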
1991
1992static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
1993 struct ceph_file_extent *img_extents,
1994 u32 num_img_extents,
1995 struct rbd_img_fill_ctx *fctx)
Ilya Dryomov5a237812018-02-06 19:26:34 +01001996{
1997 u32 i;
1998 int ret;
1999
2000 img_req->data_type = fctx->pos_type;
2001
2002 /*
2003 * Create object requests and set each object request's starting
2004 * position in the provided bio (list) or bio_vec array.
2005 */
2006 fctx->iter = *fctx->pos;
2007 for (i = 0; i < num_img_extents; i++) {
2008 ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
2009 img_extents[i].fe_off,
2010 img_extents[i].fe_len,
2011 &img_req->object_extents,
2012 alloc_object_extent, img_req,
2013 fctx->set_pos_fn, &fctx->iter);
2014 if (ret)
2015 return ret;
2016 }
2017
2018 return __rbd_img_fill_request(img_req);
2019}
2020
Ilya Dryomovafb97882018-02-06 19:26:35 +01002021/*
2022 * Map a list of image extents to a list of object extents, create the
2023 * corresponding object requests (normally each to a different object,
2024 * but not always) and add them to @img_req. For each object request,
2025 * set up its data descriptor to point to the corresponding chunk(s) of
2026 * @fctx->pos data buffer.
2027 *
2028 * Because ceph_file_to_extents() will merge adjacent object extents
2029 * together, each object request's data descriptor may point to multiple
2030 * different chunks of @fctx->pos data buffer.
2031 *
2032 * @fctx->pos data buffer is assumed to be large enough.
2033 */
2034static int rbd_img_fill_request(struct rbd_img_request *img_req,
2035 struct ceph_file_extent *img_extents,
2036 u32 num_img_extents,
2037 struct rbd_img_fill_ctx *fctx)
2038{
2039 struct rbd_device *rbd_dev = img_req->rbd_dev;
2040 struct rbd_obj_request *obj_req;
2041 u32 i;
2042 int ret;
2043
2044 if (fctx->pos_type == OBJ_REQUEST_NODATA ||
2045 !rbd_layout_is_fancy(&rbd_dev->layout))
2046 return rbd_img_fill_request_nocopy(img_req, img_extents,
2047 num_img_extents, fctx);
2048
2049 img_req->data_type = OBJ_REQUEST_OWN_BVECS;
2050
2051 /*
2052 * Create object requests and determine ->bvec_count for each object
2053 * request. Note that ->bvec_count sum over all object requests may
2054 * be greater than the number of bio_vecs in the provided bio (list)
2055 * or bio_vec array because when mapped, those bio_vecs can straddle
2056 * stripe unit boundaries.
2057 */
2058 fctx->iter = *fctx->pos;
2059 for (i = 0; i < num_img_extents; i++) {
2060 ret = ceph_file_to_extents(&rbd_dev->layout,
2061 img_extents[i].fe_off,
2062 img_extents[i].fe_len,
2063 &img_req->object_extents,
2064 alloc_object_extent, img_req,
2065 fctx->count_fn, &fctx->iter);
2066 if (ret)
2067 return ret;
2068 }
2069
2070 for_each_obj_request(img_req, obj_req) {
2071 obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
2072 sizeof(*obj_req->bvec_pos.bvecs),
2073 GFP_NOIO);
2074 if (!obj_req->bvec_pos.bvecs)
2075 return -ENOMEM;
Alex Elderb454e362013-04-19 15:34:50 -05002076 }
2077
2078 /*
Ilya Dryomovafb97882018-02-06 19:26:35 +01002079 * Fill in each object request's private bio_vec array, splitting and
2080 * rearranging the provided bio_vecs in stripe unit chunks as needed.
Alex Elderb454e362013-04-19 15:34:50 -05002081 */
Ilya Dryomovafb97882018-02-06 19:26:35 +01002082 fctx->iter = *fctx->pos;
2083 for (i = 0; i < num_img_extents; i++) {
2084 ret = ceph_iterate_extents(&rbd_dev->layout,
2085 img_extents[i].fe_off,
2086 img_extents[i].fe_len,
2087 &img_req->object_extents,
2088 fctx->copy_fn, &fctx->iter);
2089 if (ret)
2090 return ret;
2091 }
Alex Elder3d7efd12013-04-19 15:34:50 -05002092
Ilya Dryomovafb97882018-02-06 19:26:35 +01002093 return __rbd_img_fill_request(img_req);
Alex Elderb454e362013-04-19 15:34:50 -05002094}
2095
Ilya Dryomov5a237812018-02-06 19:26:34 +01002096static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
2097 u64 off, u64 len)
2098{
2099 struct ceph_file_extent ex = { off, len };
Arnd Bergmann138ed222020-01-07 22:01:04 +01002100 union rbd_img_fill_iter dummy = {};
Ilya Dryomov5a237812018-02-06 19:26:34 +01002101 struct rbd_img_fill_ctx fctx = {
2102 .pos_type = OBJ_REQUEST_NODATA,
2103 .pos = &dummy,
2104 };
2105
2106 return rbd_img_fill_request(img_req, &ex, 1, &fctx);
2107}
2108
2109static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2110{
2111 struct rbd_obj_request *obj_req =
2112 container_of(ex, struct rbd_obj_request, ex);
2113 struct ceph_bio_iter *it = arg;
2114
2115 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2116 obj_req->bio_pos = *it;
2117 ceph_bio_iter_advance(it, bytes);
2118}
2119
Ilya Dryomovafb97882018-02-06 19:26:35 +01002120static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2121{
2122 struct rbd_obj_request *obj_req =
2123 container_of(ex, struct rbd_obj_request, ex);
2124 struct ceph_bio_iter *it = arg;
2125
2126 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2127 ceph_bio_iter_advance_step(it, bytes, ({
2128 obj_req->bvec_count++;
2129 }));
2130
2131}
2132
2133static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2134{
2135 struct rbd_obj_request *obj_req =
2136 container_of(ex, struct rbd_obj_request, ex);
2137 struct ceph_bio_iter *it = arg;
2138
2139 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2140 ceph_bio_iter_advance_step(it, bytes, ({
2141 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2142 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2143 }));
2144}
2145
Ilya Dryomov5a237812018-02-06 19:26:34 +01002146static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2147 struct ceph_file_extent *img_extents,
2148 u32 num_img_extents,
2149 struct ceph_bio_iter *bio_pos)
2150{
2151 struct rbd_img_fill_ctx fctx = {
2152 .pos_type = OBJ_REQUEST_BIO,
2153 .pos = (union rbd_img_fill_iter *)bio_pos,
2154 .set_pos_fn = set_bio_pos,
Ilya Dryomovafb97882018-02-06 19:26:35 +01002155 .count_fn = count_bio_bvecs,
2156 .copy_fn = copy_bio_bvecs,
Ilya Dryomov5a237812018-02-06 19:26:34 +01002157 };
2158
2159 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2160 &fctx);
2161}
2162
2163static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2164 u64 off, u64 len, struct bio *bio)
2165{
2166 struct ceph_file_extent ex = { off, len };
2167 struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };
2168
2169 return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
2170}
2171
2172static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2173{
2174 struct rbd_obj_request *obj_req =
2175 container_of(ex, struct rbd_obj_request, ex);
2176 struct ceph_bvec_iter *it = arg;
2177
2178 obj_req->bvec_pos = *it;
2179 ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
2180 ceph_bvec_iter_advance(it, bytes);
2181}
2182
Ilya Dryomovafb97882018-02-06 19:26:35 +01002183static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2184{
2185 struct rbd_obj_request *obj_req =
2186 container_of(ex, struct rbd_obj_request, ex);
2187 struct ceph_bvec_iter *it = arg;
2188
2189 ceph_bvec_iter_advance_step(it, bytes, ({
2190 obj_req->bvec_count++;
2191 }));
2192}
2193
2194static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2195{
2196 struct rbd_obj_request *obj_req =
2197 container_of(ex, struct rbd_obj_request, ex);
2198 struct ceph_bvec_iter *it = arg;
2199
2200 ceph_bvec_iter_advance_step(it, bytes, ({
2201 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2202 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2203 }));
2204}
2205
Ilya Dryomov5a237812018-02-06 19:26:34 +01002206static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2207 struct ceph_file_extent *img_extents,
2208 u32 num_img_extents,
2209 struct ceph_bvec_iter *bvec_pos)
2210{
2211 struct rbd_img_fill_ctx fctx = {
2212 .pos_type = OBJ_REQUEST_BVECS,
2213 .pos = (union rbd_img_fill_iter *)bvec_pos,
2214 .set_pos_fn = set_bvec_pos,
Ilya Dryomovafb97882018-02-06 19:26:35 +01002215 .count_fn = count_bvecs,
2216 .copy_fn = copy_bvecs,
Ilya Dryomov5a237812018-02-06 19:26:34 +01002217 };
2218
2219 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2220 &fctx);
2221}
2222
2223static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2224 struct ceph_file_extent *img_extents,
2225 u32 num_img_extents,
2226 struct bio_vec *bvecs)
2227{
2228 struct ceph_bvec_iter it = {
2229 .bvecs = bvecs,
2230 .iter = { .bi_size = ceph_file_extents_bytes(img_extents,
2231 num_img_extents) },
2232 };
2233
2234 return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
2235 &it);
2236}
2237
Ilya Dryomovefbd1a12018-01-30 17:52:11 +01002238static void rbd_img_request_submit(struct rbd_img_request *img_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002239{
Alex Elderbf0d5f502012-11-22 00:00:08 -06002240 struct rbd_obj_request *obj_request;
2241
Alex Elder37206ee2013-02-20 17:32:08 -06002242 dout("%s: img %p\n", __func__, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002243
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002244 rbd_img_request_get(img_request);
Ilya Dryomovefbd1a12018-01-30 17:52:11 +01002245 for_each_obj_request(img_request, obj_request)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002246 rbd_obj_request_submit(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002247
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002248 rbd_img_request_put(img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002249}
2250
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002251static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
Alex Elder8b3e1a52013-01-24 16:13:36 -06002252{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002253 struct rbd_img_request *img_req = obj_req->img_request;
2254 struct rbd_img_request *child_img_req;
2255 int ret;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002256
Ilya Dryomove93aca02018-02-06 19:26:35 +01002257 child_img_req = rbd_img_request_create(img_req->rbd_dev->parent,
2258 OBJ_OP_READ, NULL);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002259 if (!child_img_req)
2260 return -ENOMEM;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002261
Ilya Dryomove93aca02018-02-06 19:26:35 +01002262 __set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2263 child_img_req->obj_request = obj_req;
Alex Elder02c74fb2013-05-06 17:40:33 -05002264
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002265 if (!rbd_img_is_write(img_req)) {
Ilya Dryomovecc633c2018-02-01 11:50:47 +01002266 switch (img_req->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002267 case OBJ_REQUEST_BIO:
Ilya Dryomov5a237812018-02-06 19:26:34 +01002268 ret = __rbd_img_fill_from_bio(child_img_req,
2269 obj_req->img_extents,
2270 obj_req->num_img_extents,
2271 &obj_req->bio_pos);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002272 break;
2273 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01002274 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov5a237812018-02-06 19:26:34 +01002275 ret = __rbd_img_fill_from_bvecs(child_img_req,
2276 obj_req->img_extents,
2277 obj_req->num_img_extents,
2278 &obj_req->bvec_pos);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002279 break;
2280 default:
2281 rbd_assert(0);
2282 }
2283 } else {
Ilya Dryomov5a237812018-02-06 19:26:34 +01002284 ret = rbd_img_fill_from_bvecs(child_img_req,
2285 obj_req->img_extents,
2286 obj_req->num_img_extents,
2287 obj_req->copyup_bvecs);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002288 }
2289 if (ret) {
2290 rbd_img_request_put(child_img_req);
2291 return ret;
2292 }
2293
2294 rbd_img_request_submit(child_img_req);
2295 return 0;
2296}
2297
2298static bool rbd_obj_handle_read(struct rbd_obj_request *obj_req)
2299{
2300 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2301 int ret;
2302
2303 if (obj_req->result == -ENOENT &&
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002304 rbd_dev->parent_overlap && !obj_req->tried_parent) {
2305 /* reverse map this object extent onto the parent */
2306 ret = rbd_obj_calc_img_extents(obj_req, false);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002307 if (ret) {
2308 obj_req->result = ret;
2309 return true;
2310 }
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002311
2312 if (obj_req->num_img_extents) {
2313 obj_req->tried_parent = true;
2314 ret = rbd_obj_read_from_parent(obj_req);
2315 if (ret) {
2316 obj_req->result = ret;
2317 return true;
2318 }
2319 return false;
2320 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002321 }
Alex Elder02c74fb2013-05-06 17:40:33 -05002322
2323 /*
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002324 * -ENOENT means a hole in the image -- zero-fill the entire
2325 * length of the request. A short read also implies zero-fill
2326 * to the end of the request. In both cases we update xferred
2327 * count to indicate the whole request was satisfied.
Alex Elder02c74fb2013-05-06 17:40:33 -05002328 */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002329 if (obj_req->result == -ENOENT ||
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002330 (!obj_req->result && obj_req->xferred < obj_req->ex.oe_len)) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002331 rbd_assert(!obj_req->xferred || !obj_req->result);
2332 rbd_obj_zero_range(obj_req, obj_req->xferred,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002333 obj_req->ex.oe_len - obj_req->xferred);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002334 obj_req->result = 0;
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002335 obj_req->xferred = obj_req->ex.oe_len;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002336 }
2337
2338 return true;
2339}
2340
2341/*
2342 * copyup_bvecs pages are never highmem pages
2343 */
2344static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
2345{
2346 struct ceph_bvec_iter it = {
2347 .bvecs = bvecs,
2348 .iter = { .bi_size = bytes },
2349 };
2350
2351 ceph_bvec_iter_advance_step(&it, bytes, ({
2352 if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
2353 bv.bv_len))
2354 return false;
2355 }));
2356 return true;
2357}
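/*
 * memchr_inv(p, c, len) returns the address of the first byte in
 * [p, p + len) that differs from c, or NULL if there is none:
 *
 *	char buf[4] = { 0, 0, 1, 0 };
 *	memchr_inv(buf, 0, 4);	returns &buf[2]
 *	memchr_inv(buf, 0, 2);	returns NULL -- all zeroes
 */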
2358
2359static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes)
2360{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002361 unsigned int num_osd_ops = obj_req->osd_req->r_num_ops;
Chengguang Xufe943d52018-04-12 12:04:55 +08002362 int ret;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002363
2364 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
2365 rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT);
2366 rbd_osd_req_destroy(obj_req->osd_req);
2367
2368 /*
2369 * Create a copyup request with the same number of OSD ops as
2370 * the original request. The original request was stat + op(s),
2371 * the new copyup request will be copyup + the same op(s).
2372 */
Ilya Dryomova162b302018-01-30 17:52:10 +01002373 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002374 if (!obj_req->osd_req)
2375 return -ENOMEM;
2376
Chengguang Xufe943d52018-04-12 12:04:55 +08002377 ret = osd_req_op_cls_init(obj_req->osd_req, 0, CEPH_OSD_OP_CALL, "rbd",
2378 "copyup");
2379 if (ret)
2380 return ret;
2381
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002382 /*
2383 * Only send non-zero copyup data to save some I/O and network
2384 * bandwidth -- zero copyup data is equivalent to the object not
2385 * existing.
2386 */
2387 if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) {
2388 dout("%s obj_req %p detected zeroes\n", __func__, obj_req);
2389 bytes = 0;
2390 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002391 osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0,
Ilya Dryomov0010f702018-05-04 16:57:30 +02002392 obj_req->copyup_bvecs,
2393 obj_req->copyup_bvec_count,
2394 bytes);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002395
Ilya Dryomov9bb02482018-01-30 17:52:10 +01002396 switch (obj_req->img_request->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002397 case OBJ_OP_WRITE:
2398 __rbd_obj_setup_write(obj_req, 1);
2399 break;
2400 case OBJ_OP_DISCARD:
2401 rbd_assert(!rbd_obj_is_entire(obj_req));
2402 __rbd_obj_setup_discard(obj_req, 1);
2403 break;
2404 default:
2405 rbd_assert(0);
2406 }
2407
2408 rbd_obj_request_submit(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002409 return 0;
2410}
2411
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01002412static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
2413{
2414 u32 i;
2415
2416 rbd_assert(!obj_req->copyup_bvecs);
2417 obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
2418 obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
2419 sizeof(*obj_req->copyup_bvecs),
2420 GFP_NOIO);
2421 if (!obj_req->copyup_bvecs)
2422 return -ENOMEM;
2423
2424 for (i = 0; i < obj_req->copyup_bvec_count; i++) {
2425 unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
2426
2427 obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
2428 if (!obj_req->copyup_bvecs[i].bv_page)
2429 return -ENOMEM;
2430
2431 obj_req->copyup_bvecs[i].bv_offset = 0;
2432 obj_req->copyup_bvecs[i].bv_len = len;
2433 obj_overlap -= len;
2434 }
2435
2436 rbd_assert(!obj_overlap);
2437 return 0;
2438}
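/*
 * Worked example, assuming PAGE_SIZE == 4096: for obj_overlap == 9000,
 * calc_pages_for(0, 9000) gives 3, so three single-page bvecs are
 * allocated and sized 4096, 4096 and 808 bytes -- 9000 in total, with
 * only the last page partially used.
 */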
2439
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002440static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req)
2441{
2442 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002443 int ret;
2444
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002445 rbd_assert(obj_req->num_img_extents);
2446 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
2447 rbd_dev->parent_overlap);
2448 if (!obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002449 /*
2450 * The overlap has become 0 (most likely because the
2451 * image has been flattened). Use rbd_obj_issue_copyup()
2452 * to re-submit the original write request -- the copyup
2453 * operation itself will be a no-op, since someone must
2454 * have populated the child object while we weren't
2455 * looking. Move to WRITE_FLAT state as we'll be done
2456 * with the operation once the null copyup completes.
2457 */
2458 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
2459 return rbd_obj_issue_copyup(obj_req, 0);
2460 }
2461
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002462 ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002463 if (ret)
2464 return ret;
2465
2466 obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002467 return rbd_obj_read_from_parent(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002468}
2469
2470static bool rbd_obj_handle_write(struct rbd_obj_request *obj_req)
2471{
2472 int ret;
2473
2474again:
2475 switch (obj_req->write_state) {
2476 case RBD_OBJ_WRITE_GUARD:
2477 rbd_assert(!obj_req->xferred);
2478 if (obj_req->result == -ENOENT) {
2479 /*
2480 * The target object doesn't exist. Read the data for
2481 * the entire target object up to the overlap point (if
2482 * any) from the parent, so we can use it for a copyup.
2483 */
2484 ret = rbd_obj_handle_write_guard(obj_req);
2485 if (ret) {
2486 obj_req->result = ret;
2487 return true;
2488 }
2489 return false;
2490 }
2491 /* fall through */
2492 case RBD_OBJ_WRITE_FLAT:
2493 if (!obj_req->result)
2494 /*
2495 * There is no such thing as a successful short
2496 * write -- indicate the whole request was satisfied.
2497 */
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002498 obj_req->xferred = obj_req->ex.oe_len;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002499 return true;
2500 case RBD_OBJ_WRITE_COPYUP:
2501 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
2502 if (obj_req->result)
2503 goto again;
2504
2505 rbd_assert(obj_req->xferred);
2506 ret = rbd_obj_issue_copyup(obj_req, obj_req->xferred);
2507 if (ret) {
2508 obj_req->result = ret;
Ilya Dryomov080ba122019-03-01 12:06:24 +01002509 obj_req->xferred = 0;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002510 return true;
2511 }
2512 return false;
2513 default:
Arnd Bergmannc6244b32018-04-04 14:53:39 +02002514 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002515 }
2516}
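/*
 * Informal summary of the write state machine above:
 *
 *	FLAT:   the write (or null copyup) has completed -- finish the
 *	        request, whether it succeeded or failed.
 *	GUARD:  -ENOENT from the guarding stat means the target object
 *	        doesn't exist: read the parent data and go to COPYUP,
 *	        or re-issue as a null copyup (state FLAT) if the
 *	        overlap has shrunk to 0; any other result is handled
 *	        as FLAT.
 *	COPYUP: the parent read has completed -- issue copyup + the
 *	        original op(s) and go back to GUARD, so the copyup's
 *	        completion finishes the request.
 */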
2517
2518/*
2519 * Returns true if @obj_req is completed, or false otherwise.
2520 */
2521static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req)
2522{
Ilya Dryomov9bb02482018-01-30 17:52:10 +01002523 switch (obj_req->img_request->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002524 case OBJ_OP_READ:
2525 return rbd_obj_handle_read(obj_req);
2526 case OBJ_OP_WRITE:
2527 return rbd_obj_handle_write(obj_req);
2528 case OBJ_OP_DISCARD:
2529 if (rbd_obj_handle_write(obj_req)) {
2530 /*
2531 * Hide -ENOENT from delete/truncate/zero -- discarding
2532 * a non-existent object is not a problem.
2533 */
2534 if (obj_req->result == -ENOENT) {
2535 obj_req->result = 0;
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002536 obj_req->xferred = obj_req->ex.oe_len;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002537 }
2538 return true;
2539 }
2540 return false;
2541 default:
Arnd Bergmannc6244b32018-04-04 14:53:39 +02002542 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002543 }
2544}
2545
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002546static void rbd_obj_end_request(struct rbd_obj_request *obj_req)
2547{
2548 struct rbd_img_request *img_req = obj_req->img_request;
2549
2550 rbd_assert((!obj_req->result &&
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002551 obj_req->xferred == obj_req->ex.oe_len) ||
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002552 (obj_req->result < 0 && !obj_req->xferred));
2553 if (!obj_req->result) {
2554 img_req->xferred += obj_req->xferred;
Ilya Dryomov980917f2016-09-12 18:59:42 +02002555 return;
Alex Elder02c74fb2013-05-06 17:40:33 -05002556 }
2557
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002558 rbd_warn(img_req->rbd_dev,
2559 "%s at objno %llu %llu~%llu result %d xferred %llu",
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002560 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
2561 obj_req->ex.oe_off, obj_req->ex.oe_len, obj_req->result,
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002562 obj_req->xferred);
2563 if (!img_req->result) {
2564 img_req->result = obj_req->result;
2565 img_req->xferred = 0;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002566 }
Alex Elder8b3e1a52013-01-24 16:13:36 -06002567}
2568
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002569static void rbd_img_end_child_request(struct rbd_img_request *img_req)
Alex Elder8b3e1a52013-01-24 16:13:36 -06002570{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002571 struct rbd_obj_request *obj_req = img_req->obj_request;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002572
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002573 rbd_assert(test_bit(IMG_REQ_CHILD, &img_req->flags));
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002574 rbd_assert((!img_req->result &&
2575 img_req->xferred == rbd_obj_img_extents_bytes(obj_req)) ||
2576 (img_req->result < 0 && !img_req->xferred));
Alex Elder8b3e1a52013-01-24 16:13:36 -06002577
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002578 obj_req->result = img_req->result;
2579 obj_req->xferred = img_req->xferred;
2580 rbd_img_request_put(img_req);
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002581}
Alex Elder8b3e1a52013-01-24 16:13:36 -06002582
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002583static void rbd_img_end_request(struct rbd_img_request *img_req)
2584{
2585 rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
2586 rbd_assert((!img_req->result &&
2587 img_req->xferred == blk_rq_bytes(img_req->rq)) ||
2588 (img_req->result < 0 && !img_req->xferred));
Alex Elder8b3e1a52013-01-24 16:13:36 -06002589
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002590 blk_mq_end_request(img_req->rq,
2591 errno_to_blk_status(img_req->result));
2592 rbd_img_request_put(img_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002593}
Alex Elder8b3e1a52013-01-24 16:13:36 -06002594
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002595static void rbd_obj_handle_request(struct rbd_obj_request *obj_req)
2596{
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002597 struct rbd_img_request *img_req;
2598
2599again:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002600 if (!__rbd_obj_handle_request(obj_req))
2601 return;
2602
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002603 img_req = obj_req->img_request;
2604 spin_lock(&img_req->completion_lock);
2605 rbd_obj_end_request(obj_req);
2606 rbd_assert(img_req->pending_count);
2607 if (--img_req->pending_count) {
2608 spin_unlock(&img_req->completion_lock);
2609 return;
2610 }
2611
2612 spin_unlock(&img_req->completion_lock);
2613 if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
2614 obj_req = img_req->obj_request;
2615 rbd_img_end_child_request(img_req);
2616 goto again;
2617 }
2618 rbd_img_end_request(img_req);
Alex Elder8b3e1a52013-01-24 16:13:36 -06002619}
2620
Ilya Dryomoved95b212016-08-12 16:40:02 +02002621static const struct rbd_client_id rbd_empty_cid;
2622
2623static bool rbd_cid_equal(const struct rbd_client_id *lhs,
2624 const struct rbd_client_id *rhs)
2625{
2626 return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
2627}
2628
2629static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
2630{
2631 struct rbd_client_id cid;
2632
2633 mutex_lock(&rbd_dev->watch_mutex);
2634 cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
2635 cid.handle = rbd_dev->watch_cookie;
2636 mutex_unlock(&rbd_dev->watch_mutex);
2637 return cid;
2638}
2639
2640/*
2641 * lock_rwsem must be held for write
2642 */
2643static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
2644 const struct rbd_client_id *cid)
2645{
2646 dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
2647 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
2648 cid->gid, cid->handle);
2649 rbd_dev->owner_cid = *cid; /* struct */
2650}
2651
2652static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
2653{
2654 mutex_lock(&rbd_dev->watch_mutex);
2655 sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
2656 mutex_unlock(&rbd_dev->watch_mutex);
2657}
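
/*
 * Illustrative example (assuming the usual "auto" cookie prefix,
 * defined elsewhere in this file): for watch_cookie 94225158 the
 * buffer above would read "auto 94225158".  Embedding the watch
 * cookie ties the lock to this client's watch, which is what
 * find_watcher() later relies on.
 */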
2658
Florian Margaineedd8ca82017-12-13 16:43:59 +01002659static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
2660{
2661 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
2662
2663 strcpy(rbd_dev->lock_cookie, cookie);
2664 rbd_set_owner_cid(rbd_dev, &cid);
2665 queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
2666}
2667
Ilya Dryomoved95b212016-08-12 16:40:02 +02002668/*
2669 * lock_rwsem must be held for write
2670 */
2671static int rbd_lock(struct rbd_device *rbd_dev)
2672{
2673 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomoved95b212016-08-12 16:40:02 +02002674 char cookie[32];
2675 int ret;
2676
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02002677 WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
2678 rbd_dev->lock_cookie[0] != '\0');
Ilya Dryomoved95b212016-08-12 16:40:02 +02002679
2680 format_lock_cookie(rbd_dev, cookie);
2681 ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
2682 RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
2683 RBD_LOCK_TAG, "", 0);
2684 if (ret)
2685 return ret;
2686
2687 rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
Florian Margaineedd8ca82017-12-13 16:43:59 +01002688 __rbd_lock(rbd_dev, cookie);
Ilya Dryomoved95b212016-08-12 16:40:02 +02002689 return 0;
2690}
2691
2692/*
2693 * lock_rwsem must be held for write
2694 */
Ilya Dryomovbbead742017-04-13 12:17:38 +02002695static void rbd_unlock(struct rbd_device *rbd_dev)
Ilya Dryomoved95b212016-08-12 16:40:02 +02002696{
2697 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomoved95b212016-08-12 16:40:02 +02002698 int ret;
2699
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02002700 WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
2701 rbd_dev->lock_cookie[0] == '\0');
Ilya Dryomoved95b212016-08-12 16:40:02 +02002702
Ilya Dryomoved95b212016-08-12 16:40:02 +02002703 ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02002704 RBD_LOCK_NAME, rbd_dev->lock_cookie);
Ilya Dryomovbbead742017-04-13 12:17:38 +02002705 if (ret && ret != -ENOENT)
2706 rbd_warn(rbd_dev, "failed to unlock: %d", ret);
Ilya Dryomoved95b212016-08-12 16:40:02 +02002707
Ilya Dryomovbbead742017-04-13 12:17:38 +02002708 /* treat errors as the image is unlocked */
2709 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02002710 rbd_dev->lock_cookie[0] = '\0';
Ilya Dryomoved95b212016-08-12 16:40:02 +02002711 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
2712 queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
Ilya Dryomoved95b212016-08-12 16:40:02 +02002713}
2714
2715static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
2716 enum rbd_notify_op notify_op,
2717 struct page ***preply_pages,
2718 size_t *preply_len)
2719{
2720 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2721 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
Kyle Spiers08a79102018-03-17 09:44:01 -07002722 char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
2723 int buf_size = sizeof(buf);
Ilya Dryomoved95b212016-08-12 16:40:02 +02002724 void *p = buf;
2725
2726 dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
2727
2728 /* encode *LockPayload NotifyMessage (op + ClientId) */
2729 ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
2730 ceph_encode_32(&p, notify_op);
2731 ceph_encode_64(&p, cid.gid);
2732 ceph_encode_64(&p, cid.handle);
2733
2734 return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
2735 &rbd_dev->header_oloc, buf, buf_size,
2736 RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
2737}
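
/*
 * For reference, the NotifyMessage payload built above looks like
 * this on the wire (a sketch read off the encoding calls; sizes in
 * bytes):
 *
 *   u8   struct_v      = 2
 *   u8   struct_compat = 1
 *   le32 struct_len    = 20
 *   le32 notify_op
 *   le64 cid.gid
 *   le64 cid.handle
 *
 * hence buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN].
 */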
2738
2739static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
2740 enum rbd_notify_op notify_op)
2741{
2742 struct page **reply_pages;
2743 size_t reply_len;
2744
2745 __rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
2746 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
2747}
2748
2749static void rbd_notify_acquired_lock(struct work_struct *work)
2750{
2751 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
2752 acquired_lock_work);
2753
2754 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
2755}
2756
2757static void rbd_notify_released_lock(struct work_struct *work)
2758{
2759 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
2760 released_lock_work);
2761
2762 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
2763}
2764
2765static int rbd_request_lock(struct rbd_device *rbd_dev)
2766{
2767 struct page **reply_pages;
2768 size_t reply_len;
2769 bool lock_owner_responded = false;
2770 int ret;
2771
2772 dout("%s rbd_dev %p\n", __func__, rbd_dev);
2773
2774 ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
2775 &reply_pages, &reply_len);
2776 if (ret && ret != -ETIMEDOUT) {
2777 rbd_warn(rbd_dev, "failed to request lock: %d", ret);
2778 goto out;
2779 }
2780
2781 if (reply_len > 0 && reply_len <= PAGE_SIZE) {
2782 void *p = page_address(reply_pages[0]);
2783 void *const end = p + reply_len;
2784 u32 n;
2785
2786 ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
2787 while (n--) {
2788 u8 struct_v;
2789 u32 len;
2790
2791 ceph_decode_need(&p, end, 8 + 8, e_inval);
2792 p += 8 + 8; /* skip gid and cookie */
2793
2794 ceph_decode_32_safe(&p, end, len, e_inval);
2795 if (!len)
2796 continue;
2797
2798 if (lock_owner_responded) {
2799 rbd_warn(rbd_dev,
2800 "duplicate lock owners detected");
2801 ret = -EIO;
2802 goto out;
2803 }
2804
2805 lock_owner_responded = true;
2806 ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
2807 &struct_v, &len);
2808 if (ret) {
2809 rbd_warn(rbd_dev,
2810 "failed to decode ResponseMessage: %d",
2811 ret);
2812 goto e_inval;
2813 }
2814
2815 ret = ceph_decode_32(&p);
2816 }
2817 }
2818
2819 if (!lock_owner_responded) {
2820 rbd_warn(rbd_dev, "no lock owners detected");
2821 ret = -ETIMEDOUT;
2822 }
2823
2824out:
2825 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
2826 return ret;
2827
2828e_inval:
2829 ret = -EINVAL;
2830 goto out;
2831}
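
/*
 * The notify reply decoded above has the following shape (an
 * assumption read off the decoding steps, not a formal spec):
 *
 *   le32 num_acks
 *   num_acks * {
 *           le64 gid, le64 cookie          (skipped)
 *           le32 len, len payload bytes    (an encoded ResponseMessage)
 *   }
 *
 * Only the current lock owner is expected to reply with a non-empty
 * payload; its le32 result becomes this function's return value.
 */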
2832
2833static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
2834{
2835 dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);
2836
2837 cancel_delayed_work(&rbd_dev->lock_dwork);
2838 if (wake_all)
2839 wake_up_all(&rbd_dev->lock_waitq);
2840 else
2841 wake_up(&rbd_dev->lock_waitq);
2842}
2843
2844static int get_lock_owner_info(struct rbd_device *rbd_dev,
2845 struct ceph_locker **lockers, u32 *num_lockers)
2846{
2847 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2848 u8 lock_type;
2849 char *lock_tag;
2850 int ret;
2851
2852 dout("%s rbd_dev %p\n", __func__, rbd_dev);
2853
2854 ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
2855 &rbd_dev->header_oloc, RBD_LOCK_NAME,
2856 &lock_type, &lock_tag, lockers, num_lockers);
2857 if (ret)
2858 return ret;
2859
2860 if (*num_lockers == 0) {
2861 dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
2862 goto out;
2863 }
2864
2865 if (strcmp(lock_tag, RBD_LOCK_TAG)) {
2866 rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
2867 lock_tag);
2868 ret = -EBUSY;
2869 goto out;
2870 }
2871
2872 if (lock_type == CEPH_CLS_LOCK_SHARED) {
2873 rbd_warn(rbd_dev, "shared lock type detected");
2874 ret = -EBUSY;
2875 goto out;
2876 }
2877
2878 if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
2879 strlen(RBD_LOCK_COOKIE_PREFIX))) {
2880 rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
2881 (*lockers)[0].id.cookie);
2882 ret = -EBUSY;
2883 goto out;
2884 }
2885
2886out:
2887 kfree(lock_tag);
2888 return ret;
2889}
2890
2891static int find_watcher(struct rbd_device *rbd_dev,
2892 const struct ceph_locker *locker)
2893{
2894 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2895 struct ceph_watch_item *watchers;
2896 u32 num_watchers;
2897 u64 cookie;
2898 int i;
2899 int ret;
2900
2901 ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
2902 &rbd_dev->header_oloc, &watchers,
2903 &num_watchers);
2904 if (ret)
2905 return ret;
2906
2907 sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
2908 for (i = 0; i < num_watchers; i++) {
2909 if (!memcmp(&watchers[i].addr, &locker->info.addr,
2910 sizeof(locker->info.addr)) &&
2911 watchers[i].cookie == cookie) {
2912 struct rbd_client_id cid = {
2913 .gid = le64_to_cpu(watchers[i].name.num),
2914 .handle = cookie,
2915 };
2916
2917 dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
2918 rbd_dev, cid.gid, cid.handle);
2919 rbd_set_owner_cid(rbd_dev, &cid);
2920 ret = 1;
2921 goto out;
2922 }
2923 }
2924
2925 dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
2926 ret = 0;
2927out:
2928 kfree(watchers);
2929 return ret;
2930}
2931
2932/*
2933 * lock_rwsem must be held for write
2934 */
2935static int rbd_try_lock(struct rbd_device *rbd_dev)
2936{
2937 struct ceph_client *client = rbd_dev->rbd_client->client;
2938 struct ceph_locker *lockers;
2939 u32 num_lockers;
2940 int ret;
2941
2942 for (;;) {
2943 ret = rbd_lock(rbd_dev);
2944 if (ret != -EBUSY)
2945 return ret;
2946
2947 /* determine if the current lock holder is still alive */
2948 ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
2949 if (ret)
2950 return ret;
2951
2952 if (num_lockers == 0)
2953 goto again;
2954
2955 ret = find_watcher(rbd_dev, lockers);
2956 if (ret) {
2957 if (ret > 0)
2958 ret = 0; /* have to request lock */
2959 goto out;
2960 }
2961
2962 rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
2963 ENTITY_NAME(lockers[0].id.name));
2964
2965 ret = ceph_monc_blacklist_add(&client->monc,
2966 &lockers[0].info.addr);
2967 if (ret) {
2968 rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
2969 ENTITY_NAME(lockers[0].id.name), ret);
2970 goto out;
2971 }
2972
2973 ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
2974 &rbd_dev->header_oloc, RBD_LOCK_NAME,
2975 lockers[0].id.cookie,
2976 &lockers[0].id.name);
2977 if (ret && ret != -ENOENT)
2978 goto out;
2979
2980again:
2981 ceph_free_lockers(lockers, num_lockers);
2982 }
2983
2984out:
2985 ceph_free_lockers(lockers, num_lockers);
2986 return ret;
2987}
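
/*
 * In outline, the acquisition loop above is (simplified):
 *
 *	while ((ret = rbd_lock(rbd_dev)) == -EBUSY) {
 *		if (lock holder still has a watch)
 *			return 0;	// alive -- must request the lock
 *		// otherwise blacklist the holder, break its lock, retry
 *	}
 */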
2988
2989/*
2990 * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED
2991 */
2992static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev,
2993 int *pret)
2994{
2995 enum rbd_lock_state lock_state;
2996
2997 down_read(&rbd_dev->lock_rwsem);
2998 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
2999 rbd_dev->lock_state);
3000 if (__rbd_is_lock_owner(rbd_dev)) {
3001 lock_state = rbd_dev->lock_state;
3002 up_read(&rbd_dev->lock_rwsem);
3003 return lock_state;
3004 }
3005
3006 up_read(&rbd_dev->lock_rwsem);
3007 down_write(&rbd_dev->lock_rwsem);
3008 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3009 rbd_dev->lock_state);
3010 if (!__rbd_is_lock_owner(rbd_dev)) {
3011 *pret = rbd_try_lock(rbd_dev);
3012 if (*pret)
3013 rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret);
3014 }
3015
3016 lock_state = rbd_dev->lock_state;
3017 up_write(&rbd_dev->lock_rwsem);
3018 return lock_state;
3019}
3020
3021static void rbd_acquire_lock(struct work_struct *work)
3022{
3023 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3024 struct rbd_device, lock_dwork);
3025 enum rbd_lock_state lock_state;
Kefeng Wang37f13252017-07-13 15:46:35 +08003026 int ret = 0;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003027
3028 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3029again:
3030 lock_state = rbd_try_acquire_lock(rbd_dev, &ret);
3031 if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) {
3032 if (lock_state == RBD_LOCK_STATE_LOCKED)
3033 wake_requests(rbd_dev, true);
3034 dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
3035 rbd_dev, lock_state, ret);
3036 return;
3037 }
3038
3039 ret = rbd_request_lock(rbd_dev);
3040 if (ret == -ETIMEDOUT) {
3041 goto again; /* treat this as a dead client */
Ilya Dryomove010dd02017-04-13 12:17:39 +02003042 } else if (ret == -EROFS) {
3043 rbd_warn(rbd_dev, "peer will not release lock");
3044 /*
3045 * If this is rbd_add_acquire_lock(), we want to fail
3046 * immediately -- reuse BLACKLISTED flag. Otherwise we
3047 * want to block.
3048 */
3049 if (!(rbd_dev->disk->flags & GENHD_FL_UP)) {
3050 set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
3051 /* wake "rbd map --exclusive" process */
3052 wake_requests(rbd_dev, false);
3053 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02003054 } else if (ret < 0) {
3055 rbd_warn(rbd_dev, "error requesting lock: %d", ret);
3056 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3057 RBD_RETRY_DELAY);
3058 } else {
3059 /*
3060 * lock owner acked, but resend if we don't see them
3061 * release the lock
3062 */
3063 dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
3064 rbd_dev);
3065 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3066 msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
3067 }
3068}
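
/*
 * Note on the resend interval above: assuming the usual
 * RBD_NOTIFY_TIMEOUT of 5 seconds (the constant is defined elsewhere
 * in this file), the lock request is resent every 10 seconds until
 * the owner releases the lock or is deemed dead.
 */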
3069
3070/*
3071 * lock_rwsem must be held for write
3072 */
3073static bool rbd_release_lock(struct rbd_device *rbd_dev)
3074{
3075 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3076 rbd_dev->lock_state);
3077 if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
3078 return false;
3079
3080 rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
3081 downgrade_write(&rbd_dev->lock_rwsem);
3082 /*
3083 * Ensure that all in-flight IO is flushed.
3084 *
3085 * FIXME: ceph_osdc_sync() flushes the entire OSD client, which
3086 * may be shared with other devices.
3087 */
3088 ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc);
3089 up_read(&rbd_dev->lock_rwsem);
3090
3091 down_write(&rbd_dev->lock_rwsem);
3092 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3093 rbd_dev->lock_state);
3094 if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
3095 return false;
3096
Ilya Dryomovbbead742017-04-13 12:17:38 +02003097 rbd_unlock(rbd_dev);
3098 /*
3099	 * Give others a chance to grab the lock - otherwise we would
3100	 * re-acquire it almost immediately if new IO arrived during
3101	 * ceph_osdc_sync(). We need to ack our own notifications, so this
3102 * lock_dwork will be requeued from rbd_wait_state_locked()
3103 * after wake_requests() in rbd_handle_released_lock().
3104 */
3105 cancel_delayed_work(&rbd_dev->lock_dwork);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003106 return true;
3107}
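
/*
 * On the locking dance above: the rwsem is downgraded (write -> read)
 * so in-flight IO can be flushed without stalling readers, then
 * dropped and re-taken for write to complete the release.  The
 * RBD_LOCK_STATE_RELEASING check catches anyone who changed the state
 * while the semaphore wasn't held for write.
 */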
3108
3109static void rbd_release_lock_work(struct work_struct *work)
3110{
3111 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3112 unlock_work);
3113
3114 down_write(&rbd_dev->lock_rwsem);
3115 rbd_release_lock(rbd_dev);
3116 up_write(&rbd_dev->lock_rwsem);
3117}
3118
3119static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
3120 void **p)
3121{
3122 struct rbd_client_id cid = { 0 };
3123
3124 if (struct_v >= 2) {
3125 cid.gid = ceph_decode_64(p);
3126 cid.handle = ceph_decode_64(p);
3127 }
3128
3129 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3130 cid.handle);
3131 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3132 down_write(&rbd_dev->lock_rwsem);
3133 if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3134 /*
3135 * we already know that the remote client is
3136 * the owner
3137 */
3138 up_write(&rbd_dev->lock_rwsem);
3139 return;
3140 }
3141
3142 rbd_set_owner_cid(rbd_dev, &cid);
3143 downgrade_write(&rbd_dev->lock_rwsem);
3144 } else {
3145 down_read(&rbd_dev->lock_rwsem);
3146 }
3147
3148 if (!__rbd_is_lock_owner(rbd_dev))
3149 wake_requests(rbd_dev, false);
3150 up_read(&rbd_dev->lock_rwsem);
3151}
3152
3153static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
3154 void **p)
3155{
3156 struct rbd_client_id cid = { 0 };
3157
3158 if (struct_v >= 2) {
3159 cid.gid = ceph_decode_64(p);
3160 cid.handle = ceph_decode_64(p);
3161 }
3162
3163 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3164 cid.handle);
3165 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3166 down_write(&rbd_dev->lock_rwsem);
3167 if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3168 dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
3169 __func__, rbd_dev, cid.gid, cid.handle,
3170 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
3171 up_write(&rbd_dev->lock_rwsem);
3172 return;
3173 }
3174
3175 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3176 downgrade_write(&rbd_dev->lock_rwsem);
3177 } else {
3178 down_read(&rbd_dev->lock_rwsem);
3179 }
3180
3181 if (!__rbd_is_lock_owner(rbd_dev))
3182 wake_requests(rbd_dev, false);
3183 up_read(&rbd_dev->lock_rwsem);
3184}
3185
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003186/*
3187 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
3188 * ResponseMessage is needed.
3189 */
3190static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
3191 void **p)
Ilya Dryomoved95b212016-08-12 16:40:02 +02003192{
3193 struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
3194 struct rbd_client_id cid = { 0 };
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003195 int result = 1;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003196
3197 if (struct_v >= 2) {
3198 cid.gid = ceph_decode_64(p);
3199 cid.handle = ceph_decode_64(p);
3200 }
3201
3202 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3203 cid.handle);
3204 if (rbd_cid_equal(&cid, &my_cid))
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003205 return result;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003206
3207 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003208 if (__rbd_is_lock_owner(rbd_dev)) {
3209 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
3210 rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
3211 goto out_unlock;
3212
3213 /*
3214 * encode ResponseMessage(0) so the peer can detect
3215 * a missing owner
3216 */
3217 result = 0;
3218
3219 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
Ilya Dryomove010dd02017-04-13 12:17:39 +02003220 if (!rbd_dev->opts->exclusive) {
3221 dout("%s rbd_dev %p queueing unlock_work\n",
3222 __func__, rbd_dev);
3223 queue_work(rbd_dev->task_wq,
3224 &rbd_dev->unlock_work);
3225 } else {
3226 /* refuse to release the lock */
3227 result = -EROFS;
3228 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02003229 }
3230 }
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003231
3232out_unlock:
Ilya Dryomoved95b212016-08-12 16:40:02 +02003233 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003234 return result;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003235}
3236
3237static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
3238 u64 notify_id, u64 cookie, s32 *result)
3239{
3240 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Kyle Spiers08a79102018-03-17 09:44:01 -07003241 char buf[4 + CEPH_ENCODING_START_BLK_LEN];
3242 int buf_size = sizeof(buf);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003243 int ret;
3244
3245 if (result) {
3246 void *p = buf;
3247
3248 /* encode ResponseMessage */
3249 ceph_start_encoding(&p, 1, 1,
3250 buf_size - CEPH_ENCODING_START_BLK_LEN);
3251 ceph_encode_32(&p, *result);
3252 } else {
3253 buf_size = 0;
3254 }
3255
3256 ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
3257 &rbd_dev->header_oloc, notify_id, cookie,
3258 buf, buf_size);
3259 if (ret)
3260 rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
3261}
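
/*
 * The ResponseMessage built above is simply a le32 result behind the
 * standard encoding header (struct_v 1, compat 1, len 4) -- hence
 * buf[4 + CEPH_ENCODING_START_BLK_LEN].  A NULL result acks with an
 * empty payload instead.
 */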
3262
3263static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
3264 u64 cookie)
3265{
3266 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3267 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
3268}
3269
3270static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
3271 u64 notify_id, u64 cookie, s32 result)
3272{
3273 dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3274 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
3275}
Ilya Dryomov922dab62016-05-26 01:15:02 +02003276
3277static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
3278 u64 notifier_id, void *data, size_t data_len)
Alex Elderb8d70032012-11-30 17:53:04 -06003279{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003280 struct rbd_device *rbd_dev = arg;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003281 void *p = data;
3282 void *const end = p + data_len;
Ilya Dryomovd4c22692016-09-06 11:15:48 +02003283 u8 struct_v = 0;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003284 u32 len;
3285 u32 notify_op;
Alex Elderb8d70032012-11-30 17:53:04 -06003286 int ret;
3287
Ilya Dryomoved95b212016-08-12 16:40:02 +02003288 dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
3289 __func__, rbd_dev, cookie, notify_id, data_len);
3290 if (data_len) {
3291 ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
3292 &struct_v, &len);
3293 if (ret) {
3294 rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
3295 ret);
3296 return;
3297 }
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04003298
Ilya Dryomoved95b212016-08-12 16:40:02 +02003299 notify_op = ceph_decode_32(&p);
3300 } else {
3301 /* legacy notification for header updates */
3302 notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
3303 len = 0;
3304 }
Alex Elderb8d70032012-11-30 17:53:04 -06003305
Ilya Dryomoved95b212016-08-12 16:40:02 +02003306 dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
3307 switch (notify_op) {
3308 case RBD_NOTIFY_OP_ACQUIRED_LOCK:
3309 rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
3310 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3311 break;
3312 case RBD_NOTIFY_OP_RELEASED_LOCK:
3313 rbd_handle_released_lock(rbd_dev, struct_v, &p);
3314 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3315 break;
3316 case RBD_NOTIFY_OP_REQUEST_LOCK:
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003317 ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
3318 if (ret <= 0)
Ilya Dryomoved95b212016-08-12 16:40:02 +02003319 rbd_acknowledge_notify_result(rbd_dev, notify_id,
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003320 cookie, ret);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003321 else
3322 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3323 break;
3324 case RBD_NOTIFY_OP_HEADER_UPDATE:
3325 ret = rbd_dev_refresh(rbd_dev);
3326 if (ret)
3327 rbd_warn(rbd_dev, "refresh failed: %d", ret);
3328
3329 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3330 break;
3331 default:
3332 if (rbd_is_lock_owner(rbd_dev))
3333 rbd_acknowledge_notify_result(rbd_dev, notify_id,
3334 cookie, -EOPNOTSUPP);
3335 else
3336 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3337 break;
3338 }
Alex Elderb8d70032012-11-30 17:53:04 -06003339}
3340
Ilya Dryomov99d16942016-08-12 16:11:41 +02003341static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
3342
Ilya Dryomov922dab62016-05-26 01:15:02 +02003343static void rbd_watch_errcb(void *arg, u64 cookie, int err)
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003344{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003345 struct rbd_device *rbd_dev = arg;
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003346
Ilya Dryomov922dab62016-05-26 01:15:02 +02003347 rbd_warn(rbd_dev, "encountered watch error: %d", err);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003348
Ilya Dryomoved95b212016-08-12 16:40:02 +02003349 down_write(&rbd_dev->lock_rwsem);
3350 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3351 up_write(&rbd_dev->lock_rwsem);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003352
Ilya Dryomov99d16942016-08-12 16:11:41 +02003353 mutex_lock(&rbd_dev->watch_mutex);
3354 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
3355 __rbd_unregister_watch(rbd_dev);
3356 rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003357
Ilya Dryomov99d16942016-08-12 16:11:41 +02003358 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003359 }
Ilya Dryomov99d16942016-08-12 16:11:41 +02003360 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003361}
3362
3363/*
Ilya Dryomov99d16942016-08-12 16:11:41 +02003364 * watch_mutex must be locked
Alex Elder9969ebc2013-01-18 12:31:10 -06003365 */
Ilya Dryomov99d16942016-08-12 16:11:41 +02003366static int __rbd_register_watch(struct rbd_device *rbd_dev)
Alex Elder9969ebc2013-01-18 12:31:10 -06003367{
3368 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomov922dab62016-05-26 01:15:02 +02003369 struct ceph_osd_linger_request *handle;
Alex Elder9969ebc2013-01-18 12:31:10 -06003370
Ilya Dryomov922dab62016-05-26 01:15:02 +02003371 rbd_assert(!rbd_dev->watch_handle);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003372 dout("%s rbd_dev %p\n", __func__, rbd_dev);
Alex Elder9969ebc2013-01-18 12:31:10 -06003373
Ilya Dryomov922dab62016-05-26 01:15:02 +02003374 handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
3375 &rbd_dev->header_oloc, rbd_watch_cb,
3376 rbd_watch_errcb, rbd_dev);
3377 if (IS_ERR(handle))
3378 return PTR_ERR(handle);
Alex Elder9969ebc2013-01-18 12:31:10 -06003379
Ilya Dryomov922dab62016-05-26 01:15:02 +02003380 rbd_dev->watch_handle = handle;
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003381 return 0;
Alex Elder9969ebc2013-01-18 12:31:10 -06003382}
3383
Ilya Dryomov99d16942016-08-12 16:11:41 +02003384/*
3385 * watch_mutex must be locked
3386 */
3387static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
Ilya Dryomovfca27062013-12-16 18:02:40 +02003388{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003389 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3390 int ret;
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003391
Ilya Dryomov99d16942016-08-12 16:11:41 +02003392 rbd_assert(rbd_dev->watch_handle);
3393 dout("%s rbd_dev %p\n", __func__, rbd_dev);
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003394
Ilya Dryomov922dab62016-05-26 01:15:02 +02003395 ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
3396 if (ret)
3397 rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003398
Ilya Dryomov922dab62016-05-26 01:15:02 +02003399 rbd_dev->watch_handle = NULL;
Ilya Dryomovc525f032016-04-28 16:07:26 +02003400}
3401
Ilya Dryomov99d16942016-08-12 16:11:41 +02003402static int rbd_register_watch(struct rbd_device *rbd_dev)
Ilya Dryomovc525f032016-04-28 16:07:26 +02003403{
Ilya Dryomov99d16942016-08-12 16:11:41 +02003404 int ret;
Ilya Dryomov811c6682016-04-15 16:22:16 +02003405
Ilya Dryomov99d16942016-08-12 16:11:41 +02003406 mutex_lock(&rbd_dev->watch_mutex);
3407 rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
3408 ret = __rbd_register_watch(rbd_dev);
3409 if (ret)
3410 goto out;
3411
3412 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
3413 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
3414
3415out:
3416 mutex_unlock(&rbd_dev->watch_mutex);
3417 return ret;
3418}
3419
3420static void cancel_tasks_sync(struct rbd_device *rbd_dev)
3421{
3422 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3423
Ilya Dryomoved95b212016-08-12 16:40:02 +02003424 cancel_work_sync(&rbd_dev->acquired_lock_work);
3425 cancel_work_sync(&rbd_dev->released_lock_work);
3426 cancel_delayed_work_sync(&rbd_dev->lock_dwork);
3427 cancel_work_sync(&rbd_dev->unlock_work);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003428}
3429
3430static void rbd_unregister_watch(struct rbd_device *rbd_dev)
3431{
Ilya Dryomoved95b212016-08-12 16:40:02 +02003432 WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
Ilya Dryomov99d16942016-08-12 16:11:41 +02003433 cancel_tasks_sync(rbd_dev);
3434
3435 mutex_lock(&rbd_dev->watch_mutex);
3436 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
3437 __rbd_unregister_watch(rbd_dev);
3438 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
3439 mutex_unlock(&rbd_dev->watch_mutex);
3440
Dongsheng Yang23edca82018-06-04 06:24:37 -04003441 cancel_delayed_work_sync(&rbd_dev->watch_dwork);
Ilya Dryomov811c6682016-04-15 16:22:16 +02003442 ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
Ilya Dryomovfca27062013-12-16 18:02:40 +02003443}
3444
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003445/*
3446 * lock_rwsem must be held for write
3447 */
3448static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
3449{
3450 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3451 char cookie[32];
3452 int ret;
3453
3454 WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);
3455
3456 format_lock_cookie(rbd_dev, cookie);
3457 ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
3458 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3459 CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
3460 RBD_LOCK_TAG, cookie);
3461 if (ret) {
3462 if (ret != -EOPNOTSUPP)
3463 rbd_warn(rbd_dev, "failed to update lock cookie: %d",
3464 ret);
3465
3466 /*
3467 * Lock cookie cannot be updated on older OSDs, so do
3468 * a manual release and queue an acquire.
3469 */
3470 if (rbd_release_lock(rbd_dev))
3471 queue_delayed_work(rbd_dev->task_wq,
3472 &rbd_dev->lock_dwork, 0);
3473 } else {
Florian Margaineedd8ca82017-12-13 16:43:59 +01003474 __rbd_lock(rbd_dev, cookie);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003475 }
3476}
3477
Ilya Dryomov99d16942016-08-12 16:11:41 +02003478static void rbd_reregister_watch(struct work_struct *work)
3479{
3480 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3481 struct rbd_device, watch_dwork);
3482 int ret;
3483
3484 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3485
3486 mutex_lock(&rbd_dev->watch_mutex);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003487 if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
3488 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003489 return;
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003490 }
Ilya Dryomov99d16942016-08-12 16:11:41 +02003491
3492 ret = __rbd_register_watch(rbd_dev);
3493 if (ret) {
3494 rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
Ilya Dryomov4d736442016-09-29 14:23:12 +02003495 if (ret == -EBLACKLISTED || ret == -ENOENT) {
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003496 set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003497 wake_requests(rbd_dev, true);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003498 } else {
Ilya Dryomov99d16942016-08-12 16:11:41 +02003499 queue_delayed_work(rbd_dev->task_wq,
3500 &rbd_dev->watch_dwork,
3501 RBD_RETRY_DELAY);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003502 }
3503 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003504 return;
Ilya Dryomov99d16942016-08-12 16:11:41 +02003505 }
3506
3507 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
3508 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
3509 mutex_unlock(&rbd_dev->watch_mutex);
3510
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003511 down_write(&rbd_dev->lock_rwsem);
3512 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
3513 rbd_reacquire_lock(rbd_dev);
3514 up_write(&rbd_dev->lock_rwsem);
3515
Ilya Dryomov99d16942016-08-12 16:11:41 +02003516 ret = rbd_dev_refresh(rbd_dev);
3517 if (ret)
Colin Ian Kingf6870cc2018-03-19 13:33:10 +00003518 rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003519}
3520
Alex Elder36be9a72013-01-19 00:30:28 -06003521/*
Alex Elderf40eb342013-04-25 15:09:42 -05003522 * Synchronous osd object method call. Returns the number of bytes
3523 * returned in the outbound buffer, or a negative error code.
Alex Elder36be9a72013-01-19 00:30:28 -06003524 */
3525static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003526 struct ceph_object_id *oid,
3527 struct ceph_object_locator *oloc,
Alex Elder36be9a72013-01-19 00:30:28 -06003528 const char *method_name,
Alex Elder41579762013-04-21 12:14:45 -05003529 const void *outbound,
Alex Elder36be9a72013-01-19 00:30:28 -06003530 size_t outbound_size,
Alex Elder41579762013-04-21 12:14:45 -05003531 void *inbound,
Alex Eldere2a58ee2013-04-30 00:44:33 -05003532 size_t inbound_size)
Alex Elder36be9a72013-01-19 00:30:28 -06003533{
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003534 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3535 struct page *req_page = NULL;
3536 struct page *reply_page;
Alex Elder36be9a72013-01-19 00:30:28 -06003537 int ret;
3538
3539 /*
Alex Elder6010a452013-04-05 01:27:11 -05003540 * Method calls are ultimately read operations. The result
3541	 * should be placed into the inbound buffer provided. They
3542 * also supply outbound data--parameters for the object
3543 * method. Currently if this is present it will be a
3544 * snapshot id.
Alex Elder36be9a72013-01-19 00:30:28 -06003545 */
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003546 if (outbound) {
3547 if (outbound_size > PAGE_SIZE)
3548 return -E2BIG;
Alex Elder36be9a72013-01-19 00:30:28 -06003549
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003550 req_page = alloc_page(GFP_KERNEL);
3551 if (!req_page)
3552 return -ENOMEM;
Alex Elder36be9a72013-01-19 00:30:28 -06003553
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003554 memcpy(page_address(req_page), outbound, outbound_size);
Alex Elder04017e22013-04-05 14:46:02 -05003555 }
Alex Elder430c28c2013-04-03 21:32:51 -05003556
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003557 reply_page = alloc_page(GFP_KERNEL);
3558 if (!reply_page) {
3559 if (req_page)
3560 __free_page(req_page);
3561 return -ENOMEM;
3562 }
Alex Elder36be9a72013-01-19 00:30:28 -06003563
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003564 ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
3565 CEPH_OSD_FLAG_READ, req_page, outbound_size,
3566 reply_page, &inbound_size);
3567 if (!ret) {
3568 memcpy(inbound, page_address(reply_page), inbound_size);
3569 ret = inbound_size;
3570 }
Alex Elder57385b52013-04-21 12:14:45 -05003571
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003572 if (req_page)
3573 __free_page(req_page);
3574 __free_page(reply_page);
Alex Elder36be9a72013-01-19 00:30:28 -06003575 return ret;
3576}
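
/*
 * A hedged usage sketch (illustrative only -- real callers appear
 * later in this file with their own reply structs): calling the
 * "get_size" method of the image header object, passing a snapshot
 * id as the outbound parameter.
 *
 *	__le64 snapid = cpu_to_le64(CEPH_NOSNAP);
 *	char reply[16];		// shaped by the method's actual reply
 *	int ret;
 *
 *	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
 *				  &rbd_dev->header_oloc, "get_size",
 *				  &snapid, sizeof(snapid),
 *				  reply, sizeof(reply));
 *	if (ret < 0)
 *		return ret;	// ret >= 0 is the reply size in bytes
 */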
3577
Ilya Dryomoved95b212016-08-12 16:40:02 +02003578/*
3579 * lock_rwsem must be held for read
3580 */
Ilya Dryomov2f18d462018-04-04 10:15:38 +02003581static int rbd_wait_state_locked(struct rbd_device *rbd_dev, bool may_acquire)
Ilya Dryomoved95b212016-08-12 16:40:02 +02003582{
3583 DEFINE_WAIT(wait);
Dongsheng Yang34f55d02018-03-26 10:22:55 -04003584 unsigned long timeout;
Ilya Dryomov2f18d462018-04-04 10:15:38 +02003585 int ret = 0;
3586
3587 if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags))
3588 return -EBLACKLISTED;
3589
3590 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
3591 return 0;
3592
3593 if (!may_acquire) {
3594 rbd_warn(rbd_dev, "exclusive lock required");
3595 return -EROFS;
3596 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02003597
3598 do {
3599 /*
3600 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3601 * and cancel_delayed_work() in wake_requests().
3602 */
3603 dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3604 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3605 prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
3606 TASK_UNINTERRUPTIBLE);
3607 up_read(&rbd_dev->lock_rwsem);
Dongsheng Yang34f55d02018-03-26 10:22:55 -04003608 timeout = schedule_timeout(ceph_timeout_jiffies(
3609 rbd_dev->opts->lock_timeout));
Ilya Dryomoved95b212016-08-12 16:40:02 +02003610 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov2f18d462018-04-04 10:15:38 +02003611 if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
3612 ret = -EBLACKLISTED;
3613 break;
3614 }
Dongsheng Yang34f55d02018-03-26 10:22:55 -04003615 if (!timeout) {
3616 rbd_warn(rbd_dev, "timed out waiting for lock");
3617 ret = -ETIMEDOUT;
3618 break;
3619 }
Ilya Dryomov2f18d462018-04-04 10:15:38 +02003620 } while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003621
Ilya Dryomoved95b212016-08-12 16:40:02 +02003622 finish_wait(&rbd_dev->lock_waitq, &wait);
Ilya Dryomov2f18d462018-04-04 10:15:38 +02003623 return ret;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003624}
3625
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003626static void rbd_queue_workfn(struct work_struct *work)
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003627{
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003628 struct request *rq = blk_mq_rq_from_pdu(work);
3629 struct rbd_device *rbd_dev = rq->q->queuedata;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003630 struct rbd_img_request *img_request;
Josh Durgin4e752f02014-04-08 11:12:11 -07003631 struct ceph_snap_context *snapc = NULL;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003632 u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
3633 u64 length = blk_rq_bytes(rq);
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08003634 enum obj_operation_type op_type;
Josh Durgin4e752f02014-04-08 11:12:11 -07003635 u64 mapping_size;
Ilya Dryomov80de1912016-09-20 14:23:17 +02003636 bool must_be_locked;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003637 int result;
3638
Christoph Hellwigaebf5262017-01-31 16:57:31 +01003639 switch (req_op(rq)) {
3640 case REQ_OP_DISCARD:
Ilya Dryomov6ac56952017-05-22 19:59:24 +02003641 case REQ_OP_WRITE_ZEROES:
Christoph Hellwigaebf5262017-01-31 16:57:31 +01003642 op_type = OBJ_OP_DISCARD;
3643 break;
3644 case REQ_OP_WRITE:
3645 op_type = OBJ_OP_WRITE;
3646 break;
3647 case REQ_OP_READ:
3648 op_type = OBJ_OP_READ;
3649 break;
3650 default:
3651 dout("%s: non-fs request type %d\n", __func__, req_op(rq));
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003652 result = -EIO;
3653 goto err;
3654 }
3655
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003656 /* Ignore/skip any zero-length requests */
3657
3658 if (!length) {
3659 dout("%s: zero-length request\n", __func__);
3660 result = 0;
3661 goto err_rq;
3662 }
3663
Ilya Dryomov9568c932017-10-12 12:35:19 +02003664 rbd_assert(op_type == OBJ_OP_READ ||
3665 rbd_dev->spec->snap_id == CEPH_NOSNAP);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003666
3667 /*
3668 * Quit early if the mapped snapshot no longer exists. It's
3669 * still possible the snapshot will have disappeared by the
3670 * time our request arrives at the osd, but there's no sense in
3671 * sending it if we already know.
3672 */
3673 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
3674 dout("request for non-existent snapshot");
3675 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
3676 result = -ENXIO;
3677 goto err_rq;
3678 }
3679
3680 if (offset && length > U64_MAX - offset + 1) {
3681 rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
3682 length);
3683 result = -EINVAL;
3684 goto err_rq; /* Shouldn't happen */
3685 }
3686
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003687 blk_mq_start_request(rq);
3688
Josh Durgin4e752f02014-04-08 11:12:11 -07003689 down_read(&rbd_dev->header_rwsem);
3690 mapping_size = rbd_dev->mapping.size;
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08003691 if (op_type != OBJ_OP_READ) {
Josh Durgin4e752f02014-04-08 11:12:11 -07003692 snapc = rbd_dev->header.snapc;
3693 ceph_get_snap_context(snapc);
3694 }
3695 up_read(&rbd_dev->header_rwsem);
3696
3697 if (offset + length > mapping_size) {
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003698 rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
Josh Durgin4e752f02014-04-08 11:12:11 -07003699 length, mapping_size);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003700 result = -EIO;
3701 goto err_rq;
3702 }
3703
Ilya Dryomovf9bebd52017-04-13 12:17:39 +02003704 must_be_locked =
3705 (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
3706 (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003707 if (must_be_locked) {
3708 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov2f18d462018-04-04 10:15:38 +02003709 result = rbd_wait_state_locked(rbd_dev,
3710 !rbd_dev->opts->exclusive);
3711 if (result)
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003712 goto err_unlock;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003713 }
3714
Ilya Dryomovdfd98752018-02-06 19:26:35 +01003715 img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003716 if (!img_request) {
3717 result = -ENOMEM;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003718 goto err_unlock;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003719 }
3720 img_request->rq = rq;
Ilya Dryomov70b16db2015-11-27 19:23:24 +01003721 snapc = NULL; /* img_request consumes a ref */
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003722
Guangliang Zhao90e98c52014-04-01 22:22:16 +08003723 if (op_type == OBJ_OP_DISCARD)
Ilya Dryomov5a237812018-02-06 19:26:34 +01003724 result = rbd_img_fill_nodata(img_request, offset, length);
Guangliang Zhao90e98c52014-04-01 22:22:16 +08003725 else
Ilya Dryomov5a237812018-02-06 19:26:34 +01003726 result = rbd_img_fill_from_bio(img_request, offset, length,
3727 rq->bio);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003728 if (result)
3729 goto err_img_request;
3730
Ilya Dryomovefbd1a12018-01-30 17:52:11 +01003731 rbd_img_request_submit(img_request);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003732 if (must_be_locked)
3733 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003734 return;
3735
3736err_img_request:
3737 rbd_img_request_put(img_request);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003738err_unlock:
3739 if (must_be_locked)
3740 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003741err_rq:
3742 if (result)
3743 rbd_warn(rbd_dev, "%s %llx at %llx result %d",
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08003744 obj_op_name(op_type), length, offset, result);
SF Markus Elfringe96a6502014-11-02 15:20:59 +01003745 ceph_put_snap_context(snapc);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003746err:
Christoph Hellwig2a842ac2017-06-03 09:38:04 +02003747 blk_mq_end_request(rq, errno_to_blk_status(result));
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003748}
3749
Christoph Hellwigfc17b652017-06-03 09:38:05 +02003750static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003751 const struct blk_mq_queue_data *bd)
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003752{
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003753 struct request *rq = bd->rq;
3754 struct work_struct *work = blk_mq_rq_to_pdu(rq);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003755
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003756 queue_work(rbd_wq, work);
Christoph Hellwigfc17b652017-06-03 09:38:05 +02003757 return BLK_STS_OK;
Alex Elderbf0d5f502012-11-22 00:00:08 -06003758}
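
/*
 * The round trip above relies on a work_struct embedded in the
 * request pdu (see tag_set.cmd_size in rbd_init_disk()):
 * blk_mq_rq_to_pdu() in rbd_queue_rq() yields the work item that
 * rbd_init_request() initialized, and blk_mq_rq_from_pdu() in
 * rbd_queue_workfn() recovers the request.
 */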
3759
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003760static void rbd_free_disk(struct rbd_device *rbd_dev)
3761{
Ilya Dryomov5769ed02017-04-13 12:17:38 +02003762 blk_cleanup_queue(rbd_dev->disk->queue);
3763 blk_mq_free_tag_set(&rbd_dev->tag_set);
3764 put_disk(rbd_dev->disk);
Alex Eldera0cab922013-04-25 23:15:08 -05003765 rbd_dev->disk = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003766}
3767
Alex Elder788e2df2013-01-17 12:25:27 -06003768static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003769 struct ceph_object_id *oid,
3770 struct ceph_object_locator *oloc,
3771 void *buf, int buf_len)
Alex Elder788e2df2013-01-17 12:25:27 -06003772
3773{
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003774 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3775 struct ceph_osd_request *req;
3776 struct page **pages;
3777 int num_pages = calc_pages_for(0, buf_len);
Alex Elder788e2df2013-01-17 12:25:27 -06003778 int ret;
3779
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003780 req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
3781 if (!req)
3782 return -ENOMEM;
Alex Elder788e2df2013-01-17 12:25:27 -06003783
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003784 ceph_oid_copy(&req->r_base_oid, oid);
3785 ceph_oloc_copy(&req->r_base_oloc, oloc);
3786 req->r_flags = CEPH_OSD_FLAG_READ;
Alex Elder788e2df2013-01-17 12:25:27 -06003787
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003788 ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
Alex Elder788e2df2013-01-17 12:25:27 -06003789 if (ret)
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003790 goto out_req;
Alex Elder788e2df2013-01-17 12:25:27 -06003791
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003792 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
3793 if (IS_ERR(pages)) {
3794 ret = PTR_ERR(pages);
3795 goto out_req;
3796 }
Alex Elder1ceae7e2013-02-06 13:11:38 -06003797
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003798 osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
3799 osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
3800 true);
Alex Elder788e2df2013-01-17 12:25:27 -06003801
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003802 ceph_osdc_start_request(osdc, req, false);
3803 ret = ceph_osdc_wait_request(osdc, req);
3804 if (ret >= 0)
3805 ceph_copy_from_page_vector(pages, buf, 0, ret);
3806
3807out_req:
3808 ceph_osdc_put_request(req);
Alex Elder788e2df2013-01-17 12:25:27 -06003809 return ret;
3810}
3811
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003812/*
Alex Elder662518b2013-05-06 09:51:29 -05003813 * Read the complete header for the given rbd device. On successful
3814 * return, the rbd_dev->header field will contain up-to-date
3815 * information about the image.
Alex Elder4156d992012-08-02 11:29:46 -05003816 */
Alex Elder99a41eb2013-05-06 09:51:30 -05003817static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
Alex Elder4156d992012-08-02 11:29:46 -05003818{
3819 struct rbd_image_header_ondisk *ondisk = NULL;
3820 u32 snap_count = 0;
3821 u64 names_size = 0;
3822 u32 want_count;
3823 int ret;
3824
3825 /*
3826 * The complete header will include an array of its 64-bit
3827 * snapshot ids, followed by the names of those snapshots as
3828 * a contiguous block of NUL-terminated strings. Note that
3829 * the number of snapshots could change by the time we read
3830 * it in, in which case we re-read it.
3831 */
3832 do {
3833 size_t size;
3834
3835 kfree(ondisk);
3836
3837 size = sizeof (*ondisk);
3838 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
3839 size += names_size;
3840 ondisk = kmalloc(size, GFP_KERNEL);
3841 if (!ondisk)
Alex Elder662518b2013-05-06 09:51:29 -05003842 return -ENOMEM;
Alex Elder4156d992012-08-02 11:29:46 -05003843
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003844 ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
3845 &rbd_dev->header_oloc, ondisk, size);
Alex Elder4156d992012-08-02 11:29:46 -05003846 if (ret < 0)
Alex Elder662518b2013-05-06 09:51:29 -05003847 goto out;
Alex Elderc0cd10db2013-04-26 09:43:47 -05003848 if ((size_t)ret < size) {
Alex Elder4156d992012-08-02 11:29:46 -05003849 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05003850 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
3851 size, ret);
Alex Elder662518b2013-05-06 09:51:29 -05003852 goto out;
Alex Elder4156d992012-08-02 11:29:46 -05003853 }
3854 if (!rbd_dev_ondisk_valid(ondisk)) {
3855 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05003856 rbd_warn(rbd_dev, "invalid header");
Alex Elder662518b2013-05-06 09:51:29 -05003857 goto out;
Alex Elder4156d992012-08-02 11:29:46 -05003858 }
3859
3860 names_size = le64_to_cpu(ondisk->snap_names_len);
3861 want_count = snap_count;
3862 snap_count = le32_to_cpu(ondisk->snap_count);
3863 } while (snap_count != want_count);
3864
Alex Elder662518b2013-05-06 09:51:29 -05003865 ret = rbd_header_from_disk(rbd_dev, ondisk);
3866out:
Alex Elder4156d992012-08-02 11:29:46 -05003867 kfree(ondisk);
3868
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003869 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003870}
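
/*
 * Layout of the buffer read above (mirrors the size computation in
 * the loop):
 *
 *   [ struct rbd_image_header_ondisk            ]
 *   [ snap_count * struct rbd_image_snap_ondisk ]
 *   [ names_size bytes of NUL-terminated names  ]
 *
 * snap_count and names_size may grow between reads, hence the
 * re-read loop.
 */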
3871
Alex Elder15228ed2013-05-01 12:43:03 -05003872/*
3873 * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
3874 * has disappeared from the (just updated) snapshot context.
3875 */
3876static void rbd_exists_validate(struct rbd_device *rbd_dev)
3877{
3878 u64 snap_id;
3879
3880 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
3881 return;
3882
3883 snap_id = rbd_dev->spec->snap_id;
3884 if (snap_id == CEPH_NOSNAP)
3885 return;
3886
3887 if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
3888 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
3889}
3890
Josh Durgin98752012013-08-29 17:26:31 -07003891static void rbd_dev_update_size(struct rbd_device *rbd_dev)
3892{
3893 sector_t size;
Josh Durgin98752012013-08-29 17:26:31 -07003894
3895 /*
Ilya Dryomov811c6682016-04-15 16:22:16 +02003896 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
3897 * try to update its size. If REMOVING is set, updating size
3898 * is just useless work since the device can't be opened.
Josh Durgin98752012013-08-29 17:26:31 -07003899 */
Ilya Dryomov811c6682016-04-15 16:22:16 +02003900 if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
3901 !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
Josh Durgin98752012013-08-29 17:26:31 -07003902 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
3903 dout("setting size to %llu sectors", (unsigned long long)size);
3904 set_capacity(rbd_dev->disk, size);
3905 revalidate_disk(rbd_dev->disk);
3906 }
3907}
3908
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003909static int rbd_dev_refresh(struct rbd_device *rbd_dev)
Alex Elder1fe5e992012-07-25 09:32:41 -05003910{
Alex Eldere627db02013-05-06 07:40:30 -05003911 u64 mapping_size;
Alex Elder1fe5e992012-07-25 09:32:41 -05003912 int ret;
3913
Alex Eldercfbf6372013-05-31 17:40:45 -05003914 down_write(&rbd_dev->header_rwsem);
Alex Elder3b5cf2a2013-05-29 11:18:59 -05003915 mapping_size = rbd_dev->mapping.size;
Ilya Dryomova720ae02014-07-23 17:11:19 +04003916
3917 ret = rbd_dev_header_info(rbd_dev);
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04003918 if (ret)
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003919 goto out;
Alex Elder15228ed2013-05-01 12:43:03 -05003920
Ilya Dryomove8f59b52014-07-24 10:42:13 +04003921 /*
3922 * If there is a parent, see if it has disappeared due to the
3923 * mapped image getting flattened.
3924 */
3925 if (rbd_dev->parent) {
3926 ret = rbd_dev_v2_parent_info(rbd_dev);
3927 if (ret)
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003928 goto out;
Ilya Dryomove8f59b52014-07-24 10:42:13 +04003929 }
3930
Ilya Dryomov5ff11082014-07-23 17:11:21 +04003931 if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003932 rbd_dev->mapping.size = rbd_dev->header.image_size;
Ilya Dryomov5ff11082014-07-23 17:11:21 +04003933 } else {
3934 /* validate mapped snapshot's EXISTS flag */
3935 rbd_exists_validate(rbd_dev);
3936 }
Alex Elder15228ed2013-05-01 12:43:03 -05003937
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003938out:
Alex Eldercfbf6372013-05-31 17:40:45 -05003939 up_write(&rbd_dev->header_rwsem);
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003940 if (!ret && mapping_size != rbd_dev->mapping.size)
Josh Durgin98752012013-08-29 17:26:31 -07003941 rbd_dev_update_size(rbd_dev);
Alex Elder1fe5e992012-07-25 09:32:41 -05003942
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003943 return ret;
Alex Elder1fe5e992012-07-25 09:32:41 -05003944}
3945
Christoph Hellwigd6296d32017-05-01 10:19:08 -06003946static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
3947 unsigned int hctx_idx, unsigned int numa_node)
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003948{
3949 struct work_struct *work = blk_mq_rq_to_pdu(rq);
3950
3951 INIT_WORK(work, rbd_queue_workfn);
3952 return 0;
3953}
3954
Eric Biggersf363b082017-03-30 13:39:16 -07003955static const struct blk_mq_ops rbd_mq_ops = {
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003956 .queue_rq = rbd_queue_rq,
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003957 .init_request = rbd_init_request,
3958};
3959
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003960static int rbd_init_disk(struct rbd_device *rbd_dev)
3961{
3962 struct gendisk *disk;
3963 struct request_queue *q;
Ilya Dryomov420efbd2018-04-16 09:32:18 +02003964 unsigned int objset_bytes =
3965 rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003966 int err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003967
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003968 /* create gendisk info */
Ilya Dryomov7e513d42013-12-16 19:26:32 +02003969 disk = alloc_disk(single_major ?
3970 (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
3971 RBD_MINORS_PER_MAJOR);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003972 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05003973 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003974
Alex Elderf0f8cef2012-01-29 13:57:44 -06003975 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05003976 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003977 disk->major = rbd_dev->major;
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02003978 disk->first_minor = rbd_dev->minor;
Ilya Dryomov7e513d42013-12-16 19:26:32 +02003979 if (single_major)
3980 disk->flags |= GENHD_FL_EXT_DEVT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003981 disk->fops = &rbd_bd_ops;
3982 disk->private_data = rbd_dev;
3983
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003984 memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
3985 rbd_dev->tag_set.ops = &rbd_mq_ops;
Ilya Dryomovb5584182015-06-23 16:21:19 +03003986 rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003987 rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
Ilya Dryomovb5584182015-06-23 16:21:19 +03003988 rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003989 rbd_dev->tag_set.nr_hw_queues = 1;
3990 rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
3991
3992 err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
3993 if (err)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003994 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07003995
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003996 q = blk_mq_init_queue(&rbd_dev->tag_set);
3997 if (IS_ERR(q)) {
3998 err = PTR_ERR(q);
3999 goto out_tag_set;
4000 }
4001
Bart Van Assche8b904b52018-03-07 17:10:10 -08004002 blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
Ilya Dryomovd8a2c892015-03-24 16:15:17 +03004003 /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
Alex Elder593a9e72012-02-07 12:03:37 -06004004
Ilya Dryomov420efbd2018-04-16 09:32:18 +02004005 blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
Ilya Dryomov0d9fde42015-10-07 16:09:35 +02004006 q->limits.max_sectors = queue_max_hw_sectors(q);
Ilya Dryomov21acdf42017-12-21 15:35:11 +01004007 blk_queue_max_segments(q, USHRT_MAX);
Ilya Dryomov24f1df62018-01-12 17:22:10 +01004008 blk_queue_max_segment_size(q, UINT_MAX);
Ilya Dryomov420efbd2018-04-16 09:32:18 +02004009 blk_queue_io_min(q, objset_bytes);
4010 blk_queue_io_opt(q, objset_bytes);
Josh Durgin029bcbd2011-07-22 11:35:23 -07004011
Ilya Dryomovd9360542018-03-23 06:14:47 +01004012 if (rbd_dev->opts->trim) {
4013 blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
4014 q->limits.discard_granularity = objset_bytes;
4015 blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
4016 blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
4017 }
Guangliang Zhao90e98c52014-04-01 22:22:16 +08004018
Ronny Hegewaldbae818e2015-10-15 18:50:46 +00004019 if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
Jan Karadc3b17c2017-02-02 15:56:50 +01004020 q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
Ronny Hegewaldbae818e2015-10-15 18:50:46 +00004021
Ilya Dryomov5769ed02017-04-13 12:17:38 +02004022 /*
4023 * disk_release() expects a queue ref from add_disk() and will
4024 * put it. Hold an extra ref until add_disk() is called.
4025 */
4026 WARN_ON(!blk_get_queue(q));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004027 disk->queue = q;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004028 q->queuedata = rbd_dev;
4029
4030 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004031
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004032 return 0;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004033out_tag_set:
4034 blk_mq_free_tag_set(&rbd_dev->tag_set);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004035out_disk:
4036 put_disk(disk);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004037 return err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004038}
4039
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004040/*
4041 sysfs
4042*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004043
Alex Elder593a9e72012-02-07 12:03:37 -06004044static struct rbd_device *dev_to_rbd_dev(struct device *dev)
4045{
4046 return container_of(dev, struct rbd_device, dev);
4047}
4048
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004049static ssize_t rbd_size_show(struct device *dev,
4050 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004051{
Alex Elder593a9e72012-02-07 12:03:37 -06004052 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004053
Alex Elderfc71d832013-04-26 15:44:36 -05004054 return sprintf(buf, "%llu\n",
4055 (unsigned long long)rbd_dev->mapping.size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004056}
4057
Alex Elder34b13182012-07-13 20:35:12 -05004058/*
4059 * Note this shows the features for whatever's mapped, which is not
4060 * necessarily the base image.
4061 */
4062static ssize_t rbd_features_show(struct device *dev,
4063 struct device_attribute *attr, char *buf)
4064{
4065 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4066
4067 return sprintf(buf, "0x%016llx\n",
Alex Elderfc71d832013-04-26 15:44:36 -05004068 (unsigned long long)rbd_dev->mapping.features);
Alex Elder34b13182012-07-13 20:35:12 -05004069}
4070
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004071static ssize_t rbd_major_show(struct device *dev,
4072 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004073{
Alex Elder593a9e72012-02-07 12:03:37 -06004074 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004075
Alex Elderfc71d832013-04-26 15:44:36 -05004076 if (rbd_dev->major)
4077 return sprintf(buf, "%d\n", rbd_dev->major);
4078
4079 return sprintf(buf, "(none)\n");
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004080}
Alex Elderfc71d832013-04-26 15:44:36 -05004081
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004082static ssize_t rbd_minor_show(struct device *dev,
4083 struct device_attribute *attr, char *buf)
4084{
4085 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4086
4087 return sprintf(buf, "%d\n", rbd_dev->minor);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004088}
4089
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02004090static ssize_t rbd_client_addr_show(struct device *dev,
4091 struct device_attribute *attr, char *buf)
4092{
4093 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4094 struct ceph_entity_addr *client_addr =
4095 ceph_client_addr(rbd_dev->rbd_client->client);
4096
4097 return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
4098 le32_to_cpu(client_addr->nonce));
4099}
4100
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004101static ssize_t rbd_client_id_show(struct device *dev,
4102 struct device_attribute *attr, char *buf)
4103{
Alex Elder593a9e72012-02-07 12:03:37 -06004104 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004105
Alex Elder1dbb4392012-01-24 10:08:37 -06004106 return sprintf(buf, "client%lld\n",
Ilya Dryomov033268a2016-08-12 14:59:58 +02004107 ceph_client_gid(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004108}
4109
Mike Christie267fb902016-08-18 18:38:43 +02004110static ssize_t rbd_cluster_fsid_show(struct device *dev,
4111 struct device_attribute *attr, char *buf)
4112{
4113 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4114
4115 return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
4116}
4117
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004118static ssize_t rbd_config_info_show(struct device *dev,
4119 struct device_attribute *attr, char *buf)
4120{
4121 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4122
4123 return sprintf(buf, "%s\n", rbd_dev->config_info);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004124}
4125
4126static ssize_t rbd_pool_show(struct device *dev,
4127 struct device_attribute *attr, char *buf)
4128{
Alex Elder593a9e72012-02-07 12:03:37 -06004129 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004130
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004131 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004132}
4133
Alex Elder9bb2f332012-07-12 10:46:35 -05004134static ssize_t rbd_pool_id_show(struct device *dev,
4135 struct device_attribute *attr, char *buf)
4136{
4137 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4138
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004139 return sprintf(buf, "%llu\n",
Alex Elderfc71d832013-04-26 15:44:36 -05004140 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05004141}
4142
Ilya Dryomovb26c0472018-07-03 15:28:43 +02004143static ssize_t rbd_pool_ns_show(struct device *dev,
4144 struct device_attribute *attr, char *buf)
4145{
4146 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4147
4148 return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
4149}
4150
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004151static ssize_t rbd_name_show(struct device *dev,
4152 struct device_attribute *attr, char *buf)
4153{
Alex Elder593a9e72012-02-07 12:03:37 -06004154 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004155
Alex Eldera92ffdf2012-10-30 19:40:33 -05004156 if (rbd_dev->spec->image_name)
4157 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
4158
4159 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004160}
4161
Alex Elder589d30e2012-07-10 20:30:11 -05004162static ssize_t rbd_image_id_show(struct device *dev,
4163 struct device_attribute *attr, char *buf)
4164{
4165 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4166
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004167 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05004168}
4169
Alex Elder34b13182012-07-13 20:35:12 -05004170/*
4171 * Shows the name of the currently-mapped snapshot (or
4172 * RBD_SNAP_HEAD_NAME for the base image).
4173 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004174static ssize_t rbd_snap_show(struct device *dev,
4175 struct device_attribute *attr,
4176 char *buf)
4177{
Alex Elder593a9e72012-02-07 12:03:37 -06004178 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004179
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004180 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004181}
4182
Mike Christie92a58672016-08-18 18:38:44 +02004183static ssize_t rbd_snap_id_show(struct device *dev,
4184 struct device_attribute *attr, char *buf)
4185{
4186 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4187
4188 return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
4189}
4190
Alex Elder86b00e02012-10-25 23:34:42 -05004191/*
Ilya Dryomovff961282014-07-22 21:53:07 +04004192 * For a v2 image, shows the chain of parent images, separated by empty
4193 * lines. For v1 images or if there is no parent, shows "(no parent
4194 * image)".
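 *
 * Sample output for a single-parent chain (all ids and names below
 * are hypothetical):
 *
 *	pool_id 2
 *	pool_name rbd
 *	pool_ns
 *	image_id 10116b8b4567
 *	image_name base
 *	snap_id 4
 *	snap_name snap1
 *	overlap 1073741824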
Alex Elder86b00e02012-10-25 23:34:42 -05004195 */
4196static ssize_t rbd_parent_show(struct device *dev,
Ilya Dryomovff961282014-07-22 21:53:07 +04004197 struct device_attribute *attr,
4198 char *buf)
Alex Elder86b00e02012-10-25 23:34:42 -05004199{
4200 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Ilya Dryomovff961282014-07-22 21:53:07 +04004201 ssize_t count = 0;
Alex Elder86b00e02012-10-25 23:34:42 -05004202
Ilya Dryomovff961282014-07-22 21:53:07 +04004203 if (!rbd_dev->parent)
Alex Elder86b00e02012-10-25 23:34:42 -05004204 return sprintf(buf, "(no parent image)\n");
4205
Ilya Dryomovff961282014-07-22 21:53:07 +04004206 for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
4207 struct rbd_spec *spec = rbd_dev->parent_spec;
Alex Elder86b00e02012-10-25 23:34:42 -05004208
Ilya Dryomovff961282014-07-22 21:53:07 +04004209 count += sprintf(&buf[count], "%s"
4210 "pool_id %llu\npool_name %s\n"
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004211 "pool_ns %s\n"
Ilya Dryomovff961282014-07-22 21:53:07 +04004212 "image_id %s\nimage_name %s\n"
4213 "snap_id %llu\nsnap_name %s\n"
4214 "overlap %llu\n",
4215 !count ? "" : "\n", /* first? */
4216 spec->pool_id, spec->pool_name,
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004217 spec->pool_ns ?: "",
Ilya Dryomovff961282014-07-22 21:53:07 +04004218 spec->image_id, spec->image_name ?: "(unknown)",
4219 spec->snap_id, spec->snap_name,
4220 rbd_dev->parent_overlap);
4221 }
Alex Elder86b00e02012-10-25 23:34:42 -05004222
Ilya Dryomovff961282014-07-22 21:53:07 +04004223 return count;
Alex Elder86b00e02012-10-25 23:34:42 -05004224}
4225
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004226static ssize_t rbd_image_refresh(struct device *dev,
4227 struct device_attribute *attr,
4228 const char *buf,
4229 size_t size)
4230{
Alex Elder593a9e72012-02-07 12:03:37 -06004231 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05004232 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004233
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004234 ret = rbd_dev_refresh(rbd_dev);
Alex Eldere627db02013-05-06 07:40:30 -05004235 if (ret)
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04004236 return ret;
Alex Elderb8136232012-07-25 09:32:41 -05004237
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04004238 return size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004239}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004240
Joe Perches5657a812018-05-24 13:38:59 -06004241static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
4242static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
4243static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
4244static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
4245static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
4246static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
4247static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
4248static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
4249static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
4250static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
Ilya Dryomovb26c0472018-07-03 15:28:43 +02004251static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
Joe Perches5657a812018-05-24 13:38:59 -06004252static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
4253static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
4254static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
4255static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
4256static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
4257static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004258
4259static struct attribute *rbd_attrs[] = {
4260 &dev_attr_size.attr,
Alex Elder34b13182012-07-13 20:35:12 -05004261 &dev_attr_features.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004262 &dev_attr_major.attr,
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004263 &dev_attr_minor.attr,
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02004264 &dev_attr_client_addr.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004265 &dev_attr_client_id.attr,
Mike Christie267fb902016-08-18 18:38:43 +02004266 &dev_attr_cluster_fsid.attr,
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004267 &dev_attr_config_info.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004268 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05004269 &dev_attr_pool_id.attr,
Ilya Dryomovb26c0472018-07-03 15:28:43 +02004270 &dev_attr_pool_ns.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004271 &dev_attr_name.attr,
Alex Elder589d30e2012-07-10 20:30:11 -05004272 &dev_attr_image_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004273 &dev_attr_current_snap.attr,
Mike Christie92a58672016-08-18 18:38:44 +02004274 &dev_attr_snap_id.attr,
Alex Elder86b00e02012-10-25 23:34:42 -05004275 &dev_attr_parent.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004276 &dev_attr_refresh.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004277 NULL
4278};
4279
4280static struct attribute_group rbd_attr_group = {
4281 .attrs = rbd_attrs,
4282};
4283
4284static const struct attribute_group *rbd_attr_groups[] = {
4285 &rbd_attr_group,
4286 NULL
4287};
4288
Ilya Dryomov6cac4692015-10-16 20:11:25 +02004289static void rbd_dev_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004290
Bhumika Goyalb9942bc2017-02-11 12:14:38 +05304291static const struct device_type rbd_device_type = {
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004292 .name = "rbd",
4293 .groups = rbd_attr_groups,
Ilya Dryomov6cac4692015-10-16 20:11:25 +02004294 .release = rbd_dev_release,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004295};
4296
Alex Elder8b8fb992012-10-26 17:25:24 -05004297static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
4298{
4299 kref_get(&spec->kref);
4300
4301 return spec;
4302}
4303
4304static void rbd_spec_free(struct kref *kref);
4305static void rbd_spec_put(struct rbd_spec *spec)
4306{
4307 if (spec)
4308 kref_put(&spec->kref, rbd_spec_free);
4309}
4310
4311static struct rbd_spec *rbd_spec_alloc(void)
4312{
4313 struct rbd_spec *spec;
4314
4315 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
4316 if (!spec)
4317 return NULL;
Ilya Dryomov04077592014-07-23 17:11:20 +04004318
4319 spec->pool_id = CEPH_NOPOOL;
4320 spec->snap_id = CEPH_NOSNAP;
Alex Elder8b8fb992012-10-26 17:25:24 -05004321 kref_init(&spec->kref);
4322
Alex Elder8b8fb992012-10-26 17:25:24 -05004323 return spec;
4324}
4325
4326static void rbd_spec_free(struct kref *kref)
4327{
4328 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
4329
4330 kfree(spec->pool_name);
Ilya Dryomovb26c0472018-07-03 15:28:43 +02004331 kfree(spec->pool_ns);
Alex Elder8b8fb992012-10-26 17:25:24 -05004332 kfree(spec->image_id);
4333 kfree(spec->image_name);
4334 kfree(spec->snap_name);
4335 kfree(spec);
4336}
4337
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004338static void rbd_dev_free(struct rbd_device *rbd_dev)
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004339{
Ilya Dryomov99d16942016-08-12 16:11:41 +02004340 WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004341 WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004342
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004343 ceph_oid_destroy(&rbd_dev->header_oid);
Ilya Dryomov6b6dddb2016-08-05 16:15:38 +02004344 ceph_oloc_destroy(&rbd_dev->header_oloc);
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004345 kfree(rbd_dev->config_info);
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004346
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004347 rbd_put_client(rbd_dev->rbd_client);
4348 rbd_spec_put(rbd_dev->spec);
4349 kfree(rbd_dev->opts);
4350 kfree(rbd_dev);
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004351}
4352
4353static void rbd_dev_release(struct device *dev)
4354{
4355 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4356 bool need_put = !!rbd_dev->opts;
4357
4358 if (need_put) {
4359 destroy_workqueue(rbd_dev->task_wq);
4360 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4361 }
4362
4363 rbd_dev_free(rbd_dev);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004364
4365 /*
4366 * This is racy, but way better than putting module outside of
4367 * the release callback. The race window is pretty small, so
4368 * doing something similar to dm (dm-builtin.c) is overkill.
4369 */
4370 if (need_put)
4371 module_put(THIS_MODULE);
4372}
4373
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004374static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
4375 struct rbd_spec *spec)
Alex Elderc53d5892012-10-25 23:34:42 -05004376{
4377 struct rbd_device *rbd_dev;
4378
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004379 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
Alex Elderc53d5892012-10-25 23:34:42 -05004380 if (!rbd_dev)
4381 return NULL;
4382
4383 spin_lock_init(&rbd_dev->lock);
4384 INIT_LIST_HEAD(&rbd_dev->node);
Alex Elderc53d5892012-10-25 23:34:42 -05004385 init_rwsem(&rbd_dev->header_rwsem);
4386
Ilya Dryomov7e973322017-01-25 18:16:22 +01004387 rbd_dev->header.data_pool_id = CEPH_NOPOOL;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004388 ceph_oid_init(&rbd_dev->header_oid);
Ilya Dryomov431a02c2017-01-25 18:16:21 +01004389 rbd_dev->header_oloc.pool = spec->pool_id;
Ilya Dryomovb26c0472018-07-03 15:28:43 +02004390 if (spec->pool_ns) {
4391 WARN_ON(!*spec->pool_ns);
4392 rbd_dev->header_oloc.pool_ns =
4393 ceph_find_or_create_string(spec->pool_ns,
4394 strlen(spec->pool_ns));
4395 }
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004396
Ilya Dryomov99d16942016-08-12 16:11:41 +02004397 mutex_init(&rbd_dev->watch_mutex);
4398 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
4399 INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
4400
Ilya Dryomoved95b212016-08-12 16:40:02 +02004401 init_rwsem(&rbd_dev->lock_rwsem);
4402 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
4403 INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
4404 INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
4405 INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
4406 INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
4407 init_waitqueue_head(&rbd_dev->lock_waitq);
4408
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004409 rbd_dev->dev.bus = &rbd_bus_type;
4410 rbd_dev->dev.type = &rbd_device_type;
4411 rbd_dev->dev.parent = &rbd_root_dev;
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004412 device_initialize(&rbd_dev->dev);
4413
Alex Elderc53d5892012-10-25 23:34:42 -05004414 rbd_dev->rbd_client = rbdc;
Ilya Dryomovd1475432015-06-22 13:24:48 +03004415 rbd_dev->spec = spec;
Alex Elder0903e872012-11-14 12:25:19 -06004416
Alex Elderc53d5892012-10-25 23:34:42 -05004417 return rbd_dev;
4418}
4419
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004420/*
4421 * Create a mapping rbd_dev.
4422 */
4423static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
4424 struct rbd_spec *spec,
4425 struct rbd_options *opts)
4426{
4427 struct rbd_device *rbd_dev;
4428
4429 rbd_dev = __rbd_dev_create(rbdc, spec);
4430 if (!rbd_dev)
4431 return NULL;
4432
4433 rbd_dev->opts = opts;
4434
4435 /* get an id and fill in device name */
4436 rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
4437 minor_to_rbd_dev_id(1 << MINORBITS),
4438 GFP_KERNEL);
4439 if (rbd_dev->dev_id < 0)
4440 goto fail_rbd_dev;
4441
4442 sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
4443 rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
4444 rbd_dev->name);
4445 if (!rbd_dev->task_wq)
4446 goto fail_dev_id;
4447
4448 /* we have a ref from do_rbd_add() */
4449 __module_get(THIS_MODULE);
4450
4451 dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
4452 return rbd_dev;
4453
4454fail_dev_id:
4455 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4456fail_rbd_dev:
4457 rbd_dev_free(rbd_dev);
4458 return NULL;
4459}
4460
Alex Elderc53d5892012-10-25 23:34:42 -05004461static void rbd_dev_destroy(struct rbd_device *rbd_dev)
4462{
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004463 if (rbd_dev)
4464 put_device(&rbd_dev->dev);
Alex Elderc53d5892012-10-25 23:34:42 -05004465}
4466
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004467/*
Alex Elder9d475de2012-07-03 16:01:19 -05004468 * Get the size and object order for an image snapshot, or if
4469 * snap_id is CEPH_NOSNAP, get this information for the base
4470 * image.
4471 */
4472static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
4473 u8 *order, u64 *snap_size)
4474{
4475 __le64 snapid = cpu_to_le64(snap_id);
4476 int ret;
4477 struct {
4478 u8 order;
4479 __le64 size;
4480 } __attribute__ ((packed)) size_buf = { 0 };
4481
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004482 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4483 &rbd_dev->header_oloc, "get_size",
4484 &snapid, sizeof(snapid),
4485 &size_buf, sizeof(size_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06004486 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder9d475de2012-07-03 16:01:19 -05004487 if (ret < 0)
4488 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05004489 if (ret < sizeof (size_buf))
4490 return -ERANGE;
Alex Elder9d475de2012-07-03 16:01:19 -05004491
Josh Durginc3545572013-08-28 17:08:10 -07004492 if (order) {
Alex Elderc86f86e2013-04-25 15:09:41 -05004493 *order = size_buf.order;
Josh Durginc3545572013-08-28 17:08:10 -07004494 dout(" order %u", (unsigned int)*order);
4495 }
Alex Elder9d475de2012-07-03 16:01:19 -05004496 *snap_size = le64_to_cpu(size_buf.size);
4497
Josh Durginc3545572013-08-28 17:08:10 -07004498 dout(" snap_id 0x%016llx snap_size = %llu\n",
4499 (unsigned long long)snap_id,
Alex Elder57385b52013-04-21 12:14:45 -05004500 (unsigned long long)*snap_size);
Alex Elder9d475de2012-07-03 16:01:19 -05004501
4502 return 0;
4503}
4504
4505static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
4506{
4507 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
4508 &rbd_dev->header.obj_order,
4509 &rbd_dev->header.image_size);
4510}
4511
Alex Elder1e130192012-07-03 16:01:19 -05004512static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
4513{
4514 void *reply_buf;
4515 int ret;
4516 void *p;
4517
4518 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
4519 if (!reply_buf)
4520 return -ENOMEM;
4521
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004522 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4523 &rbd_dev->header_oloc, "get_object_prefix",
4524 NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
Alex Elder36be9a72013-01-19 00:30:28 -06004525 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder1e130192012-07-03 16:01:19 -05004526 if (ret < 0)
4527 goto out;
4528
4529 p = reply_buf;
4530 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
Alex Elder57385b52013-04-21 12:14:45 -05004531 p + ret, NULL, GFP_NOIO);
4532 ret = 0;
Alex Elder1e130192012-07-03 16:01:19 -05004533
4534 if (IS_ERR(rbd_dev->header.object_prefix)) {
4535 ret = PTR_ERR(rbd_dev->header.object_prefix);
4536 rbd_dev->header.object_prefix = NULL;
4537 } else {
4538 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
4539 }
Alex Elder1e130192012-07-03 16:01:19 -05004540out:
4541 kfree(reply_buf);
4542
4543 return ret;
4544}
4545
Alex Elderb1b54022012-07-03 16:01:19 -05004546static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
4547 u64 *snap_features)
4548{
4549 __le64 snapid = cpu_to_le64(snap_id);
4550 struct {
4551 __le64 features;
4552 __le64 incompat;
Alex Elder41579762013-04-21 12:14:45 -05004553 } __attribute__ ((packed)) features_buf = { 0 };
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004554 u64 unsup;
Alex Elderb1b54022012-07-03 16:01:19 -05004555 int ret;
4556
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004557 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4558 &rbd_dev->header_oloc, "get_features",
4559 &snapid, sizeof(snapid),
4560 &features_buf, sizeof(features_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06004561 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderb1b54022012-07-03 16:01:19 -05004562 if (ret < 0)
4563 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05004564 if (ret < sizeof (features_buf))
4565 return -ERANGE;
Alex Elderd8891402012-10-09 13:50:17 -07004566
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004567 unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
4568 if (unsup) {
4569 rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
4570 unsup);
Alex Elderb8f5c6e2012-11-01 08:39:26 -05004571 return -ENXIO;
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004572 }
Alex Elderd8891402012-10-09 13:50:17 -07004573
Alex Elderb1b54022012-07-03 16:01:19 -05004574 *snap_features = le64_to_cpu(features_buf.features);
4575
4576 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
Alex Elder57385b52013-04-21 12:14:45 -05004577 (unsigned long long)snap_id,
4578 (unsigned long long)*snap_features,
4579 (unsigned long long)le64_to_cpu(features_buf.incompat));
Alex Elderb1b54022012-07-03 16:01:19 -05004580
4581 return 0;
4582}
4583
4584static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
4585{
4586 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
4587 &rbd_dev->header.features);
4588}
4589
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004590struct parent_image_info {
4591 u64 pool_id;
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004592 const char *pool_ns;
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004593 const char *image_id;
4594 u64 snap_id;
4595
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004596 bool has_overlap;
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004597 u64 overlap;
4598};
4599
4600/*
4601 * The caller is responsible for @pii.
4602 */
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004603static int decode_parent_image_spec(void **p, void *end,
4604 struct parent_image_info *pii)
4605{
4606 u8 struct_v;
4607 u32 struct_len;
4608 int ret;
4609
4610 ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
4611 &struct_v, &struct_len);
4612 if (ret)
4613 return ret;
4614
4615 ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
4616 pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
4617 if (IS_ERR(pii->pool_ns)) {
4618 ret = PTR_ERR(pii->pool_ns);
4619 pii->pool_ns = NULL;
4620 return ret;
4621 }
4622 pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
4623 if (IS_ERR(pii->image_id)) {
4624 ret = PTR_ERR(pii->image_id);
4625 pii->image_id = NULL;
4626 return ret;
4627 }
4628 ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
4629 return 0;
4630
4631e_inval:
4632 return -EINVAL;
4633}
4634
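/*
 * Returns 1 if the "parent_get" class method is not supported by the
 * OSDs (ceph_osdc_call() fails with -EOPNOTSUPP), telling the caller
 * to fall back to __get_parent_info_legacy().  The caller is
 * responsible for @pii.
 */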
4635static int __get_parent_info(struct rbd_device *rbd_dev,
4636 struct page *req_page,
4637 struct page *reply_page,
4638 struct parent_image_info *pii)
4639{
4640 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4641 size_t reply_len = PAGE_SIZE;
4642 void *p, *end;
4643 int ret;
4644
4645 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
4646 "rbd", "parent_get", CEPH_OSD_FLAG_READ,
4647 req_page, sizeof(u64), reply_page, &reply_len);
4648 if (ret)
4649 return ret == -EOPNOTSUPP ? 1 : ret;
4650
4651 p = page_address(reply_page);
4652 end = p + reply_len;
4653 ret = decode_parent_image_spec(&p, end, pii);
4654 if (ret)
4655 return ret;
4656
4657 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
4658 "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
4659 req_page, sizeof(u64), reply_page, &reply_len);
4660 if (ret)
4661 return ret;
4662
4663 p = page_address(reply_page);
4664 end = p + reply_len;
4665 ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
4666 if (pii->has_overlap)
4667 ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
4668
4669 return 0;
4670
4671e_inval:
4672 return -EINVAL;
4673}
4674
4675/*
4676 * The caller is responsible for @pii.
4677 */
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004678static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
4679 struct page *req_page,
4680 struct page *reply_page,
4681 struct parent_image_info *pii)
4682{
4683 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4684 size_t reply_len = PAGE_SIZE;
4685 void *p, *end;
4686 int ret;
4687
4688 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
4689 "rbd", "get_parent", CEPH_OSD_FLAG_READ,
4690 req_page, sizeof(u64), reply_page, &reply_len);
4691 if (ret)
4692 return ret;
4693
4694 p = page_address(reply_page);
4695 end = p + reply_len;
4696 ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
4697 pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
4698 if (IS_ERR(pii->image_id)) {
4699 ret = PTR_ERR(pii->image_id);
4700 pii->image_id = NULL;
4701 return ret;
4702 }
4703 ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004704 pii->has_overlap = true;
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004705 ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
4706
4707 return 0;
4708
4709e_inval:
4710 return -EINVAL;
4711}
4712
4713static int get_parent_info(struct rbd_device *rbd_dev,
4714 struct parent_image_info *pii)
4715{
4716 struct page *req_page, *reply_page;
4717 void *p;
4718 int ret;
4719
4720 req_page = alloc_page(GFP_KERNEL);
4721 if (!req_page)
4722 return -ENOMEM;
4723
4724 reply_page = alloc_page(GFP_KERNEL);
4725 if (!reply_page) {
4726 __free_page(req_page);
4727 return -ENOMEM;
4728 }
4729
4730 p = page_address(req_page);
4731 ceph_encode_64(&p, rbd_dev->spec->snap_id);
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004732 ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
4733 if (ret > 0)
4734 ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
4735 pii);
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004736
4737 __free_page(req_page);
4738 __free_page(reply_page);
4739 return ret;
4740}
4741
Alex Elder86b00e02012-10-25 23:34:42 -05004742static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
4743{
4744 struct rbd_spec *parent_spec;
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004745 struct parent_image_info pii = { 0 };
Alex Elder86b00e02012-10-25 23:34:42 -05004746 int ret;
4747
4748 parent_spec = rbd_spec_alloc();
4749 if (!parent_spec)
4750 return -ENOMEM;
4751
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004752 ret = get_parent_info(rbd_dev, &pii);
4753 if (ret)
Alex Elder86b00e02012-10-25 23:34:42 -05004754 goto out_err;
4755
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004756 dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
4757 __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
4758 pii.has_overlap, pii.overlap);
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004759
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004760 if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
Alex Elder392a9da2013-05-06 17:40:33 -05004761 /*
4762 * Either the parent never existed, or we have
4763 * record of it but the image got flattened so it no
4764 * longer has a parent. When the parent of a
4765 * layered image disappears we immediately set the
4766 * overlap to 0. The effect of this is that all new
4767 * requests will be treated as if the image had no
4768 * parent.
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004769 *
4770 * If !pii.has_overlap, the parent image spec is not
4771 * applicable. It's there to avoid duplication in each
4772 * snapshot record.
Alex Elder392a9da2013-05-06 17:40:33 -05004773 */
4774 if (rbd_dev->parent_overlap) {
4775 rbd_dev->parent_overlap = 0;
Alex Elder392a9da2013-05-06 17:40:33 -05004776 rbd_dev_parent_put(rbd_dev);
4777 pr_info("%s: clone image has been flattened\n",
4778 rbd_dev->disk->disk_name);
4779 }
4780
Alex Elder86b00e02012-10-25 23:34:42 -05004781 goto out; /* No parent? No problem. */
Alex Elder392a9da2013-05-06 17:40:33 -05004782 }
Alex Elder86b00e02012-10-25 23:34:42 -05004783
Alex Elder0903e872012-11-14 12:25:19 -06004784 /* The ceph file layout needs to fit pool id in 32 bits */
4785
4786 ret = -EIO;
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004787 if (pii.pool_id > (u64)U32_MAX) {
Ilya Dryomov9584d502014-07-11 12:11:20 +04004788 rbd_warn(NULL, "parent pool id too large (%llu > %u)",
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004789 (unsigned long long)pii.pool_id, U32_MAX);
Alex Elder57385b52013-04-21 12:14:45 -05004790 goto out_err;
Alex Elderc0cd10db2013-04-26 09:43:47 -05004791 }
Alex Elder0903e872012-11-14 12:25:19 -06004792
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004793 /*
4794 * The parent won't change (except when the clone is
4795 * flattened, which is handled above). So we only need to
4796 * record the parent spec if we have not already done so.
4797 */
4798 if (!rbd_dev->parent_spec) {
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004799 parent_spec->pool_id = pii.pool_id;
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004800 if (pii.pool_ns && *pii.pool_ns) {
4801 parent_spec->pool_ns = pii.pool_ns;
4802 pii.pool_ns = NULL;
4803 }
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004804 parent_spec->image_id = pii.image_id;
4805 pii.image_id = NULL;
4806 parent_spec->snap_id = pii.snap_id;
Ilya Dryomovb26c0472018-07-03 15:28:43 +02004807
Alex Elder70cf49c2013-05-06 17:40:33 -05004808 rbd_dev->parent_spec = parent_spec;
4809 parent_spec = NULL; /* rbd_dev now owns this */
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004810 }
4811
4812 /*
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004813 * We always update the parent overlap. If it's zero we issue
4814 * a warning, as we will proceed as if there was no parent.
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004815 */
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004816 if (!pii.overlap) {
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004817 if (parent_spec) {
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004818 /* refresh, careful to warn just once */
4819 if (rbd_dev->parent_overlap)
4820 rbd_warn(rbd_dev,
4821 "clone now standalone (overlap became 0)");
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004822 } else {
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004823 /* initial probe */
4824 rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004825 }
Alex Elder70cf49c2013-05-06 17:40:33 -05004826 }
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004827 rbd_dev->parent_overlap = pii.overlap;
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004828
Alex Elder86b00e02012-10-25 23:34:42 -05004829out:
4830 ret = 0;
4831out_err:
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004832 kfree(pii.pool_ns);
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004833 kfree(pii.image_id);
Alex Elder86b00e02012-10-25 23:34:42 -05004834 rbd_spec_put(parent_spec);
Alex Elder86b00e02012-10-25 23:34:42 -05004835 return ret;
4836}
4837
Alex Eldercc070d52013-04-21 12:14:45 -05004838static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
4839{
4840 struct {
4841 __le64 stripe_unit;
4842 __le64 stripe_count;
4843 } __attribute__ ((packed)) striping_info_buf = { 0 };
4844 size_t size = sizeof (striping_info_buf);
4845 void *p;
Alex Eldercc070d52013-04-21 12:14:45 -05004846 int ret;
4847
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004848 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4849 &rbd_dev->header_oloc, "get_stripe_unit_count",
4850 NULL, 0, &striping_info_buf, size);
Alex Eldercc070d52013-04-21 12:14:45 -05004851 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4852 if (ret < 0)
4853 return ret;
4854 if (ret < size)
4855 return -ERANGE;
4856
Alex Eldercc070d52013-04-21 12:14:45 -05004857 p = &striping_info_buf;
Ilya Dryomovb1331852018-02-07 12:09:12 +01004858 rbd_dev->header.stripe_unit = ceph_decode_64(&p);
4859 rbd_dev->header.stripe_count = ceph_decode_64(&p);
Alex Eldercc070d52013-04-21 12:14:45 -05004860 return 0;
4861}
4862
Ilya Dryomov7e973322017-01-25 18:16:22 +01004863static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
4864{
4865 __le64 data_pool_id;
4866 int ret;
4867
4868 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4869 &rbd_dev->header_oloc, "get_data_pool",
4870 NULL, 0, &data_pool_id, sizeof(data_pool_id));
4871 if (ret < 0)
4872 return ret;
4873 if (ret < sizeof(data_pool_id))
4874 return -EBADMSG;
4875
4876 rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
4877 WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
4878 return 0;
4879}
4880
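/*
 * Look up the image name for rbd_dev's image id by calling the
 * "dir_get_name" class method on the pool's rbd directory object.
 * Returns a dynamically-allocated name, or NULL on any failure;
 * callers treat the name as optional.
 */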
Alex Elder9e15b772012-10-30 19:40:33 -05004881static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
4882{
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004883 CEPH_DEFINE_OID_ONSTACK(oid);
Alex Elder9e15b772012-10-30 19:40:33 -05004884 size_t image_id_size;
4885 char *image_id;
4886 void *p;
4887 void *end;
4888 size_t size;
4889 void *reply_buf = NULL;
4890 size_t len = 0;
4891 char *image_name = NULL;
4892 int ret;
4893
4894 rbd_assert(!rbd_dev->spec->image_name);
4895
Alex Elder69e7a022012-11-01 08:39:26 -05004896 len = strlen(rbd_dev->spec->image_id);
4897 image_id_size = sizeof (__le32) + len;
Alex Elder9e15b772012-10-30 19:40:33 -05004898 image_id = kmalloc(image_id_size, GFP_KERNEL);
4899 if (!image_id)
4900 return NULL;
4901
4902 p = image_id;
Alex Elder41579762013-04-21 12:14:45 -05004903 end = image_id + image_id_size;
Alex Elder57385b52013-04-21 12:14:45 -05004904 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
Alex Elder9e15b772012-10-30 19:40:33 -05004905
4906 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
4907 reply_buf = kmalloc(size, GFP_KERNEL);
4908 if (!reply_buf)
4909 goto out;
4910
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004911 ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
4912 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
4913 "dir_get_name", image_id, image_id_size,
4914 reply_buf, size);
Alex Elder9e15b772012-10-30 19:40:33 -05004915 if (ret < 0)
4916 goto out;
4917 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05004918 end = reply_buf + ret;
4919
Alex Elder9e15b772012-10-30 19:40:33 -05004920 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
4921 if (IS_ERR(image_name))
4922 image_name = NULL;
4923 else
4924 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
4925out:
4926 kfree(reply_buf);
4927 kfree(image_id);
4928
4929 return image_name;
4930}
4931
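/*
 * Format 1 snapshot names are stored consecutively as NUL-terminated
 * strings in header.snap_names, parallel to the snapc->snaps id
 * array, so step through both in lockstep.
 */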
Alex Elder2ad3d712013-04-30 00:44:33 -05004932static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4933{
4934 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4935 const char *snap_name;
4936 u32 which = 0;
4937
4938 /* Skip over names until we find the one we are looking for */
4939
4940 snap_name = rbd_dev->header.snap_names;
4941 while (which < snapc->num_snaps) {
4942 if (!strcmp(name, snap_name))
4943 return snapc->snaps[which];
4944 snap_name += strlen(snap_name) + 1;
4945 which++;
4946 }
4947 return CEPH_NOSNAP;
4948}
4949
4950static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4951{
4952 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4953 u32 which;
4954 bool found = false;
4955 u64 snap_id;
4956
4957 for (which = 0; !found && which < snapc->num_snaps; which++) {
4958 const char *snap_name;
4959
4960 snap_id = snapc->snaps[which];
4961 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
Josh Durginefadc982013-08-29 19:16:42 -07004962 if (IS_ERR(snap_name)) {
4963 /* ignore no-longer existing snapshots */
4964 if (PTR_ERR(snap_name) == -ENOENT)
4965 continue;
4966 else
4967 break;
4968 }
Alex Elder2ad3d712013-04-30 00:44:33 -05004969 found = !strcmp(name, snap_name);
4970 kfree(snap_name);
4971 }
4972 return found ? snap_id : CEPH_NOSNAP;
4973}
4974
4975/*
4976 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
4977 * no snapshot by that name is found, or if an error occurs.
4978 */
4979static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4980{
4981 if (rbd_dev->image_format == 1)
4982 return rbd_v1_snap_id_by_name(rbd_dev, name);
4983
4984 return rbd_v2_snap_id_by_name(rbd_dev, name);
4985}
4986
Alex Elder9e15b772012-10-30 19:40:33 -05004987/*
Ilya Dryomov04077592014-07-23 17:11:20 +04004988 * An image being mapped will have everything but the snap id.
Alex Elder9e15b772012-10-30 19:40:33 -05004989 */
Ilya Dryomov04077592014-07-23 17:11:20 +04004990static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
4991{
4992 struct rbd_spec *spec = rbd_dev->spec;
4993
4994 rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
4995 rbd_assert(spec->image_id && spec->image_name);
4996 rbd_assert(spec->snap_name);
4997
4998 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
4999 u64 snap_id;
5000
5001 snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
5002 if (snap_id == CEPH_NOSNAP)
5003 return -ENOENT;
5004
5005 spec->snap_id = snap_id;
5006 } else {
5007 spec->snap_id = CEPH_NOSNAP;
5008 }
5009
5010 return 0;
5011}
5012
5013/*
5014 * A parent image will have all ids but none of the names.
5015 *
5016 * All names in an rbd spec are dynamically allocated. It's OK if we
5017 * can't figure out the name for an image id.
5018 */
5019static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
Alex Elder9e15b772012-10-30 19:40:33 -05005020{
Alex Elder2e9f7f12013-04-26 09:43:48 -05005021 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5022 struct rbd_spec *spec = rbd_dev->spec;
5023 const char *pool_name;
5024 const char *image_name;
5025 const char *snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05005026 int ret;
5027
Ilya Dryomov04077592014-07-23 17:11:20 +04005028 rbd_assert(spec->pool_id != CEPH_NOPOOL);
5029 rbd_assert(spec->image_id);
5030 rbd_assert(spec->snap_id != CEPH_NOSNAP);
Alex Elder9e15b772012-10-30 19:40:33 -05005031
Alex Elder2e9f7f12013-04-26 09:43:48 -05005032 /* Get the pool name; we have to make our own copy of this */
Alex Elder9e15b772012-10-30 19:40:33 -05005033
Alex Elder2e9f7f12013-04-26 09:43:48 -05005034 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
5035 if (!pool_name) {
5036 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05005037 return -EIO;
5038 }
Alex Elder2e9f7f12013-04-26 09:43:48 -05005039 pool_name = kstrdup(pool_name, GFP_KERNEL);
5040 if (!pool_name)
Alex Elder9e15b772012-10-30 19:40:33 -05005041 return -ENOMEM;
5042
5043 /* Fetch the image name; tolerate failure here */
5044
Alex Elder2e9f7f12013-04-26 09:43:48 -05005045 image_name = rbd_dev_image_name(rbd_dev);
5046 if (!image_name)
Alex Elder06ecc6c2012-11-01 10:17:15 -05005047 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05005048
Ilya Dryomov04077592014-07-23 17:11:20 +04005049 /* Fetch the snapshot name */
Alex Elder9e15b772012-10-30 19:40:33 -05005050
Alex Elder2e9f7f12013-04-26 09:43:48 -05005051 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
Josh Durginda6a6b62013-09-04 17:57:31 -07005052 if (IS_ERR(snap_name)) {
5053 ret = PTR_ERR(snap_name);
Alex Elder9e15b772012-10-30 19:40:33 -05005054 goto out_err;
Alex Elder2e9f7f12013-04-26 09:43:48 -05005055 }
5056
5057 spec->pool_name = pool_name;
5058 spec->image_name = image_name;
5059 spec->snap_name = snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05005060
5061 return 0;
Ilya Dryomov04077592014-07-23 17:11:20 +04005062
Alex Elder9e15b772012-10-30 19:40:33 -05005063out_err:
Alex Elder2e9f7f12013-04-26 09:43:48 -05005064 kfree(image_name);
5065 kfree(pool_name);
Alex Elder9e15b772012-10-30 19:40:33 -05005066 return ret;
5067}
5068
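/*
 * The "get_snapcontext" reply decoded below is laid out as a __le64
 * seq (maximum snapshot id), a __le32 snap_count and then snap_count
 * __le64 snapshot ids.
 */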
Alex Eldercc4a38bd2013-04-30 00:44:33 -05005069static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
Alex Elder35d489f2012-07-03 16:01:19 -05005070{
5071 size_t size;
5072 int ret;
5073 void *reply_buf;
5074 void *p;
5075 void *end;
5076 u64 seq;
5077 u32 snap_count;
5078 struct ceph_snap_context *snapc;
5079 u32 i;
5080
5081 /*
5082 * We'll need room for the seq value (maximum snapshot id),
5083 * snapshot count, and array of that many snapshot ids.
5084 * For now we have a fixed upper limit on the number we're
5085 * prepared to receive.
5086 */
5087 size = sizeof (__le64) + sizeof (__le32) +
5088 RBD_MAX_SNAP_COUNT * sizeof (__le64);
5089 reply_buf = kzalloc(size, GFP_KERNEL);
5090 if (!reply_buf)
5091 return -ENOMEM;
5092
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005093 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5094 &rbd_dev->header_oloc, "get_snapcontext",
5095 NULL, 0, reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06005096 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder35d489f2012-07-03 16:01:19 -05005097 if (ret < 0)
5098 goto out;
5099
Alex Elder35d489f2012-07-03 16:01:19 -05005100 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05005101 end = reply_buf + ret;
5102 ret = -ERANGE;
Alex Elder35d489f2012-07-03 16:01:19 -05005103 ceph_decode_64_safe(&p, end, seq, out);
5104 ceph_decode_32_safe(&p, end, snap_count, out);
5105
5106 /*
5107 * Make sure the reported number of snapshot ids wouldn't go
5108 * beyond the end of our buffer. But before checking that,
5109 * make sure the computed size of the snapshot context we
5110 * allocate is representable in a size_t.
5111 */
5112 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
5113 / sizeof (u64)) {
5114 ret = -EINVAL;
5115 goto out;
5116 }
5117 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
5118 goto out;
Alex Elder468521c2013-04-26 09:43:47 -05005119 ret = 0;
Alex Elder35d489f2012-07-03 16:01:19 -05005120
Alex Elder812164f82013-04-30 00:44:32 -05005121 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
Alex Elder35d489f2012-07-03 16:01:19 -05005122 if (!snapc) {
5123 ret = -ENOMEM;
5124 goto out;
5125 }
Alex Elder35d489f2012-07-03 16:01:19 -05005126 snapc->seq = seq;
Alex Elder35d489f2012-07-03 16:01:19 -05005127 for (i = 0; i < snap_count; i++)
5128 snapc->snaps[i] = ceph_decode_64(&p);
5129
Alex Elder49ece552013-05-06 08:37:00 -05005130 ceph_put_snap_context(rbd_dev->header.snapc);
Alex Elder35d489f2012-07-03 16:01:19 -05005131 rbd_dev->header.snapc = snapc;
5132
5133 dout(" snap context seq = %llu, snap_count = %u\n",
Alex Elder57385b52013-04-21 12:14:45 -05005134 (unsigned long long)seq, (unsigned int)snap_count);
Alex Elder35d489f2012-07-03 16:01:19 -05005135out:
5136 kfree(reply_buf);
5137
Alex Elder57385b52013-04-21 12:14:45 -05005138 return ret;
Alex Elder35d489f2012-07-03 16:01:19 -05005139}
5140
Alex Elder54cac612013-04-30 00:44:33 -05005141static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
5142 u64 snap_id)
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005143{
5144 size_t size;
5145 void *reply_buf;
Alex Elder54cac612013-04-30 00:44:33 -05005146 __le64 snapid;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005147 int ret;
5148 void *p;
5149 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005150 char *snap_name;
5151
5152 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
5153 reply_buf = kmalloc(size, GFP_KERNEL);
5154 if (!reply_buf)
5155 return ERR_PTR(-ENOMEM);
5156
Alex Elder54cac612013-04-30 00:44:33 -05005157 snapid = cpu_to_le64(snap_id);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005158 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5159 &rbd_dev->header_oloc, "get_snapshot_name",
5160 &snapid, sizeof(snapid), reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06005161 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderf40eb342013-04-25 15:09:42 -05005162 if (ret < 0) {
5163 snap_name = ERR_PTR(ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005164 goto out;
Alex Elderf40eb342013-04-25 15:09:42 -05005165 }
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005166
5167 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05005168 end = reply_buf + ret;
Alex Eldere5c35532012-10-25 23:34:41 -05005169 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderf40eb342013-04-25 15:09:42 -05005170 if (IS_ERR(snap_name))
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005171 goto out;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005172
Alex Elderf40eb342013-04-25 15:09:42 -05005173 dout(" snap_id 0x%016llx snap_name = %s\n",
Alex Elder54cac612013-04-30 00:44:33 -05005174 (unsigned long long)snap_id, snap_name);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005175out:
5176 kfree(reply_buf);
5177
Alex Elderf40eb342013-04-25 15:09:42 -05005178 return snap_name;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005179}
5180
Alex Elder2df3fac2013-05-06 09:51:30 -05005181static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
Alex Elder117973f2012-08-31 17:29:55 -05005182{
Alex Elder2df3fac2013-05-06 09:51:30 -05005183 bool first_time = rbd_dev->header.object_prefix == NULL;
Alex Elder117973f2012-08-31 17:29:55 -05005184 int ret;
Alex Elder117973f2012-08-31 17:29:55 -05005185
Josh Durgin1617e402013-06-12 14:43:10 -07005186 ret = rbd_dev_v2_image_size(rbd_dev);
5187 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05005188 return ret;
Josh Durgin1617e402013-06-12 14:43:10 -07005189
Alex Elder2df3fac2013-05-06 09:51:30 -05005190 if (first_time) {
5191 ret = rbd_dev_v2_header_onetime(rbd_dev);
5192 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05005193 return ret;
Alex Elder2df3fac2013-05-06 09:51:30 -05005194 }
5195
Alex Eldercc4a38bd2013-04-30 00:44:33 -05005196 ret = rbd_dev_v2_snap_context(rbd_dev);
Ilya Dryomovd194cd12015-08-31 18:22:10 +03005197 if (ret && first_time) {
5198 kfree(rbd_dev->header.object_prefix);
5199 rbd_dev->header.object_prefix = NULL;
5200 }
Alex Elder117973f2012-08-31 17:29:55 -05005201
5202 return ret;
5203}
5204
Ilya Dryomova720ae02014-07-23 17:11:19 +04005205static int rbd_dev_header_info(struct rbd_device *rbd_dev)
5206{
5207 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5208
5209 if (rbd_dev->image_format == 1)
5210 return rbd_dev_v1_header_info(rbd_dev);
5211
5212 return rbd_dev_v2_header_info(rbd_dev);
5213}
5214
Alex Elder1ddbe942012-01-29 13:57:44 -06005215/*
Alex Eldere28fff262012-02-02 08:13:30 -06005216 * Skips over white space at *buf, and updates *buf to point to the
5217 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06005218 * the token (string of non-white space characters) found. Note
5219 * that *buf must be terminated with '\0'.
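 *
 * For example (hypothetical input): with *buf pointing at "  pool img",
 * this returns 4 and leaves *buf pointing at "pool img".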
Alex Eldere28fff262012-02-02 08:13:30 -06005220 */
5221static inline size_t next_token(const char **buf)
5222{
5223 /*
5224 * These are the characters that produce nonzero for
5225 * isspace() in the "C" and "POSIX" locales.
5226 */
5227 const char *spaces = " \f\n\r\t\v";
5228
5229 *buf += strspn(*buf, spaces); /* Find start of token */
5230
5231 return strcspn(*buf, spaces); /* Return token length */
5232}
5233
5234/*
Alex Elderea3352f2012-07-09 21:04:23 -05005235 * Finds the next token in *buf, dynamically allocates a buffer big
5236 * enough to hold a copy of it, and copies the token into the new
5237 * buffer. The copy is guaranteed to be terminated with '\0'. Note
5238 * that a duplicate buffer is created even for a zero-length token.
5239 *
5240 * Returns a pointer to the newly-allocated duplicate, or a null
5241 * pointer if memory for the duplicate was not available. If
5242 * the lenp argument is a non-null pointer, the length of the token
5243 * (not including the '\0') is returned in *lenp.
5244 *
5245 * If successful, the *buf pointer will be updated to point beyond
5246 * the end of the found token.
5247 *
5248 * Note: uses GFP_KERNEL for allocation.
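 *
 * For example (hypothetical input): with *buf pointing at "rbd foo",
 * this returns a kmalloc'd copy of "rbd", stores 3 in *lenp if lenp
 * is non-null, and advances *buf to point at " foo".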
5249 */
5250static inline char *dup_token(const char **buf, size_t *lenp)
5251{
5252 char *dup;
5253 size_t len;
5254
5255 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05005256 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05005257 if (!dup)
5258 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05005259 *(dup + len) = '\0';
5260 *buf += len;
5261
5262 if (lenp)
5263 *lenp = len;
5264
5265 return dup;
5266}
5267
5268/*
Alex Elder859c31d2012-10-25 23:34:42 -05005269 * Parse the options provided for an "rbd add" (i.e., rbd image
5270 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
5271 * and the data written is passed here via a NUL-terminated buffer.
5272 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05005273 *
Alex Elder859c31d2012-10-25 23:34:42 -05005274 * The information extracted from these options is recorded in
5275 * the other parameters which return dynamically-allocated
5276 * structures:
5277 * ceph_opts
5278 * The address of a pointer that will refer to a ceph options
5279 * structure. Caller must release the returned pointer using
5280 * ceph_destroy_options() when it is no longer needed.
5281 * rbd_opts
5282 * Address of an rbd options pointer. Fully initialized by
5283 * this function; caller must release with kfree().
5284 * spec
5285 * Address of an rbd image specification pointer. Fully
5286 * initialized by this function based on parsed options.
5287 * Caller must release with rbd_spec_put().
5288 *
5289 * The options passed take this form:
5290 * <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
5291 * where:
5292 * <mon_addrs>
5293 * A comma-separated list of one or more monitor addresses.
5294 * A monitor address is an ip address, optionally followed
5295 * by a port number (separated by a colon).
5296 * I.e.: ip1[:port1][,ip2[:port2]...]
5297 * <options>
5298 * A comma-separated list of ceph and/or rbd options.
5299 * <pool_name>
5300 * The name of the rados pool containing the rbd image.
5301 * <image_name>
5302 * The name of the image in that pool to map.
5303 * <snap_name>
5304 * An optional snapshot name. If provided, the mapping will
5305 * present data from the image at the time that snapshot was
5306 * created. The image head is used if no snapshot name is
5307 * provided. Snapshot mappings are always read-only.
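 *
 * For example, writing the following (monitor address, credentials
 * and names all hypothetical) maps the head of image "foo" in pool
 * "rbd":
 *
 *	1.2.3.4:6789 name=admin,secret=<key> rbd foo -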
Alex Eldera725f65e2012-02-02 08:13:30 -06005308 */
static int rbd_add_parse_args(const char *buf,
			      struct ceph_options **ceph_opts,
			      struct rbd_options **opts,
			      struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	char *snap_name;
	size_t mon_addrs_size;
	struct parse_rbd_opts_ctx pctx = { 0 };
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	pctx.spec = rbd_spec_alloc();
	if (!pctx.spec)
		goto out_mem;

	pctx.spec->pool_name = dup_token(&buf, NULL);
	if (!pctx.spec->pool_name)
		goto out_mem;
	if (!*pctx.spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	pctx.spec->image_name = dup_token(&buf, NULL);
	if (!pctx.spec->image_name)
		goto out_mem;
	if (!*pctx.spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_mem;
	*(snap_name + len) = '\0';
	pctx.spec->snap_name = snap_name;

	/* Initialize all rbd options to the defaults */

	pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
	if (!pctx.opts)
		goto out_mem;

	pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
	pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
	pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
	pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
	pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
	pctx.opts->trim = RBD_TRIM_DEFAULT;

	copts = ceph_parse_options(options, mon_addrs,
				   mon_addrs + mon_addrs_size - 1,
				   parse_rbd_opts_token, &pctx);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	*ceph_opts = copts;
	*opts = pctx.opts;
	*rbd_spec = pctx.spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	kfree(pctx.opts);
	rbd_spec_put(pctx.spec);
	kfree(options);

	return ret;
}

static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
{
	down_write(&rbd_dev->lock_rwsem);
	if (__rbd_is_lock_owner(rbd_dev))
		rbd_unlock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);
}

static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
{
	int ret;

	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
		rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
		return -EINVAL;
	}

	/* FIXME: "rbd map --exclusive" should be interruptible */
	down_read(&rbd_dev->lock_rwsem);
	ret = rbd_wait_state_locked(rbd_dev, true);
	up_read(&rbd_dev->lock_rwsem);
	if (ret) {
		rbd_warn(rbd_dev, "failed to acquire exclusive lock");
		return -EROFS;
	}

	return 0;
}
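
/*
 * Sketch of how this is reached (hypothetical command line): mapping
 * with the "exclusive" option, e.g. "rbd map --exclusive myimage",
 * makes do_rbd_add() call rbd_add_acquire_lock() so that the map
 * fails up front with -EROFS if the lock cannot be acquired, rather
 * than having I/O wait on lock acquisition later.
 */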

/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	CEPH_DEFINE_OID_ONSTACK(oid);
	void *response;
	char *image_id;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.  We
	 * do still need to set the image format though.
	 */
	if (rbd_dev->spec->image_id) {
		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;

		return 0;
	}

	/*
	 * First, see if the format 2 image id object exists, and if
	 * so, get the image's persistent id from it.
	 */
	ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
			       rbd_dev->spec->image_name);
	if (ret)
		return ret;

	dout("rbd id object name is %s\n", oid.name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* If it doesn't exist we'll assume it's a format 1 image */

	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
				  "get_id", NULL, 0,
				  response, RBD_IMAGE_ID_LEN_MAX);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret == -ENOENT) {
		image_id = kstrdup("", GFP_KERNEL);
		ret = image_id ? 0 : -ENOMEM;
		if (!ret)
			rbd_dev->image_format = 1;
	} else if (ret >= 0) {
		void *p = response;

		image_id = ceph_extract_encoded_string(&p, p + ret,
						       NULL, GFP_NOIO);
		ret = PTR_ERR_OR_ZERO(image_id);
		if (!ret)
			rbd_dev->image_format = 2;
	}

	if (!ret) {
		rbd_dev->spec->image_id = image_id;
		dout("image_id is %s\n", image_id);
	}
out:
	kfree(response);
	ceph_oid_destroy(&oid);
	return ret;
}
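
/*
 * Example (hypothetical names): for a v2 image "myimage", the id
 * object is named RBD_ID_PREFIX "myimage" (i.e. "rbd_id.myimage"),
 * and its "get_id" class method returns a length-prefixed string
 * such as "10746b8b4567", which becomes rbd_dev->spec->image_id.
 */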

/*
 * Undo whatever state changes are made by a v1 or v2 header info
 * call.
 */
static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
{
	struct rbd_image_header *header;

	rbd_dev_parent_put(rbd_dev);

	/* Free dynamic fields from the header, then zero it out */

	header = &rbd_dev->header;
	ceph_put_snap_context(header->snapc);
	kfree(header->snap_sizes);
	kfree(header->snap_names);
	kfree(header->object_prefix);
	memset(header, 0, sizeof (*header));
}

static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
{
	int ret;

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret)
		goto out_err;

	/*
	 * Get and check the features for the image.  Currently the
	 * features are assumed to never change.
	 */
	ret = rbd_dev_v2_features(rbd_dev);
	if (ret)
		goto out_err;

	/* If the image supports fancy striping, get its parameters */

	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
		ret = rbd_dev_v2_striping_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
		ret = rbd_dev_v2_data_pool(rbd_dev);
		if (ret)
			goto out_err;
	}

	rbd_init_layout(rbd_dev);
	return 0;

out_err:
	rbd_dev->header.features = 0;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;
	return ret;
}

/*
 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
 * rbd_dev_image_probe() recursion depth, which means it's also the
 * length of the already discovered part of the parent chain.
 */
static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
{
	struct rbd_device *parent = NULL;
	int ret;

	if (!rbd_dev->parent_spec)
		return 0;

	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
		pr_info("parent chain is too long (%d)\n", depth);
		ret = -EINVAL;
		goto out_err;
	}

	parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
	if (!parent) {
		ret = -ENOMEM;
		goto out_err;
	}

	/*
	 * Images related by parent/child relationships always share
	 * rbd_client and spec/parent_spec, so bump their refcounts.
	 */
	__rbd_get_client(rbd_dev->rbd_client);
	rbd_spec_get(rbd_dev->parent_spec);

	ret = rbd_dev_image_probe(parent, depth);
	if (ret < 0)
		goto out_err;

	rbd_dev->parent = parent;
	atomic_set(&rbd_dev->parent_ref, 1);
	return 0;

out_err:
	rbd_dev_unparent(rbd_dev);
	rbd_dev_destroy(parent);
	return ret;
}

static void rbd_dev_device_release(struct rbd_device *rbd_dev)
{
	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
	rbd_dev_mapping_clear(rbd_dev);
	rbd_free_disk(rbd_dev);
	if (!single_major)
		unregister_blkdev(rbd_dev->major, rbd_dev->name);
}

/*
 * rbd_dev->header_rwsem must be locked for write and will be unlocked
 * upon return.
 */
static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
{
	int ret;

	/* Record our major and minor device numbers. */

	if (!single_major) {
		ret = register_blkdev(0, rbd_dev->name);
		if (ret < 0)
			goto err_out_unlock;

		rbd_dev->major = ret;
		rbd_dev->minor = 0;
	} else {
		rbd_dev->major = rbd_major;
		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
	}

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_dev_mapping_set(rbd_dev);
	if (ret)
		goto err_out_disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
	set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);

	ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
	if (ret)
		goto err_out_mapping;

	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
	up_write(&rbd_dev->header_rwsem);
	return 0;

err_out_mapping:
	rbd_dev_mapping_clear(rbd_dev);
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	if (!single_major)
		unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_unlock:
	up_write(&rbd_dev->header_rwsem);
	return ret;
}

static int rbd_dev_header_name(struct rbd_device *rbd_dev)
{
	struct rbd_spec *spec = rbd_dev->spec;
	int ret;

	/* Record the header object name for this rbd image. */

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (rbd_dev->image_format == 1)
		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
				       spec->image_name, RBD_SUFFIX);
	else
		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
				       RBD_HEADER_PREFIX, spec->image_id);

	return ret;
}
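
/*
 * Example (hypothetical names): a format 1 image "myimage" gets the
 * header object "myimage" RBD_SUFFIX (i.e. "myimage.rbd"), while a
 * format 2 image with id "10746b8b4567" gets RBD_HEADER_PREFIX plus
 * the id (i.e. "rbd_header.10746b8b4567").
 */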

static void rbd_dev_image_release(struct rbd_device *rbd_dev)
{
	rbd_dev_unprobe(rbd_dev);
	if (rbd_dev->opts)
		rbd_unregister_watch(rbd_dev);
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;
}

/*
 * Probe for the existence of the header object for the given rbd
 * device.  If this image is the one being mapped (i.e., not a
 * parent), initiate a watch on its header object before using that
 * object to get detailed information about the rbd image.
 */
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
{
	int ret;

	/*
	 * Get the id from the image id object.  Unless there's an
	 * error, rbd_dev->spec->image_id will be filled in with
	 * a dynamically-allocated string, and rbd_dev->image_format
	 * will be set to either 1 or 2.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_header_name(rbd_dev);
	if (ret)
		goto err_out_format;

	if (!depth) {
		ret = rbd_register_watch(rbd_dev);
		if (ret) {
			if (ret == -ENOENT)
				pr_info("image %s/%s%s%s does not exist\n",
					rbd_dev->spec->pool_name,
					rbd_dev->spec->pool_ns ?: "",
					rbd_dev->spec->pool_ns ? "/" : "",
					rbd_dev->spec->image_name);
			goto err_out_format;
		}
	}

	ret = rbd_dev_header_info(rbd_dev);
	if (ret)
		goto err_out_watch;

	/*
	 * If this image is the one being mapped, we have pool name and
	 * id, image name and id, and snap name - need to fill snap id.
	 * Otherwise this is a parent image, identified by pool, image
	 * and snap ids - need to fill in names for those ids.
	 */
	if (!depth)
		ret = rbd_spec_fill_snap_id(rbd_dev);
	else
		ret = rbd_spec_fill_names(rbd_dev);
	if (ret) {
		if (ret == -ENOENT)
			pr_info("snap %s/%s%s%s@%s does not exist\n",
				rbd_dev->spec->pool_name,
				rbd_dev->spec->pool_ns ?: "",
				rbd_dev->spec->pool_ns ? "/" : "",
				rbd_dev->spec->image_name,
				rbd_dev->spec->snap_name);
		goto err_out_probe;
	}

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret)
			goto err_out_probe;

		/*
		 * Need to warn users if this image is the one being
		 * mapped and has a parent.
		 */
		if (!depth && rbd_dev->parent_spec)
			rbd_warn(rbd_dev,
				 "WARNING: kernel layering is EXPERIMENTAL!");
	}

	ret = rbd_dev_probe_parent(rbd_dev, depth);
	if (ret)
		goto err_out_probe;

	dout("discovered format %u image, header name is %s\n",
	     rbd_dev->image_format, rbd_dev->header_oid.name);
	return 0;

err_out_probe:
	rbd_dev_unprobe(rbd_dev);
err_out_watch:
	if (!depth)
		rbd_unregister_watch(rbd_dev);
err_out_format:
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;
	return ret;
}

static ssize_t do_rbd_add(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	int rc;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto out;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}

	/* pick the pool */
	rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
	if (rc < 0) {
		if (rc == -ENOENT)
			pr_info("pool %s does not exist\n", spec->pool_name);
		goto err_out_client;
	}
	spec->pool_id = (u64)rc;

	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
	if (!rbd_dev) {
		rc = -ENOMEM;
		goto err_out_client;
	}
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */
	rbd_opts = NULL;	/* rbd_dev now owns this */

	rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
	if (!rbd_dev->config_info) {
		rc = -ENOMEM;
		goto err_out_rbd_dev;
	}

	down_write(&rbd_dev->header_rwsem);
	rc = rbd_dev_image_probe(rbd_dev, 0);
	if (rc < 0) {
		up_write(&rbd_dev->header_rwsem);
		goto err_out_rbd_dev;
	}

	/* If we are mapping a snapshot it must be marked read-only */
	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
		rbd_dev->opts->read_only = true;

	rc = rbd_dev_device_setup(rbd_dev);
	if (rc)
		goto err_out_image_probe;

	if (rbd_dev->opts->exclusive) {
		rc = rbd_add_acquire_lock(rbd_dev);
		if (rc)
			goto err_out_device_setup;
	}

	/* Everything's ready.  Announce the disk to the world. */

	rc = device_add(&rbd_dev->dev);
	if (rc)
		goto err_out_image_lock;

	add_disk(rbd_dev->disk);
	/* see rbd_init_disk() */
	blk_put_queue(rbd_dev->disk->queue);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);

	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
		rbd_dev->header.features);
	rc = count;
out:
	module_put(THIS_MODULE);
	return rc;

err_out_image_lock:
	rbd_dev_image_unlock(rbd_dev);
err_out_device_setup:
	rbd_dev_device_release(rbd_dev);
err_out_image_probe:
	rbd_dev_image_release(rbd_dev);
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	rbd_spec_put(spec);
	kfree(rbd_opts);
	goto out;
}
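
/*
 * End-to-end usage sketch (hypothetical values), from userspace:
 *
 *	$ echo "1.2.3.4:6789 name=admin rbd myimage -" > /sys/bus/rbd/add
 *
 * On success a block device such as /dev/rbd0 appears and a line like
 * "rbd0: capacity 10737418240 features 0x1" is logged.  The trailing
 * "-" (RBD_SNAP_HEAD_NAME) maps the image head rather than a snapshot.
 */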

static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	if (single_major)
		return -EINVAL;

	return do_rbd_add(bus, buf, count);
}

static ssize_t rbd_add_single_major(struct bus_type *bus,
				    const char *buf,
				    size_t count)
{
	return do_rbd_add(bus, buf, count);
}

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
{
	while (rbd_dev->parent) {
		struct rbd_device *first = rbd_dev;
		struct rbd_device *second = first->parent;
		struct rbd_device *third;

		/*
		 * Follow to the parent with no grandparent and
		 * remove it.
		 */
		while (second && (third = second->parent)) {
			first = second;
			second = third;
		}
		rbd_assert(second);
		rbd_dev_image_release(second);
		rbd_dev_destroy(second);
		first->parent = NULL;
		first->parent_overlap = 0;

		rbd_assert(first->parent_spec);
		rbd_spec_put(first->parent_spec);
		first->parent_spec = NULL;
	}
}

static ssize_t do_rbd_remove(struct bus_type *bus,
			     const char *buf,
			     size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct list_head *tmp;
	int dev_id;
	char opt_buf[6];
	bool force = false;
	int ret;

	dev_id = -1;
	opt_buf[0] = '\0';
	sscanf(buf, "%d %5s", &dev_id, opt_buf);
	if (dev_id < 0) {
		pr_err("dev_id out of range\n");
		return -EINVAL;
	}
	if (opt_buf[0] != '\0') {
		if (!strcmp(opt_buf, "force")) {
			force = true;
		} else {
			pr_err("bad remove option at '%s'\n", opt_buf);
			return -EINVAL;
		}
	}

	ret = -ENOENT;
	spin_lock(&rbd_dev_list_lock);
	list_for_each(tmp, &rbd_dev_list) {
		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id == dev_id) {
			ret = 0;
			break;
		}
	}
	if (!ret) {
		spin_lock_irq(&rbd_dev->lock);
		if (rbd_dev->open_count && !force)
			ret = -EBUSY;
		else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
					  &rbd_dev->flags))
			ret = -EINPROGRESS;
		spin_unlock_irq(&rbd_dev->lock);
	}
	spin_unlock(&rbd_dev_list_lock);
	if (ret)
		return ret;

	if (force) {
		/*
		 * Prevent new IO from being queued and wait for existing
		 * IO to complete/fail.
		 */
		blk_mq_freeze_queue(rbd_dev->disk->queue);
		blk_set_queue_dying(rbd_dev->disk->queue);
	}

	del_gendisk(rbd_dev->disk);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);
	spin_unlock(&rbd_dev_list_lock);
	device_del(&rbd_dev->dev);

	rbd_dev_image_unlock(rbd_dev);
	rbd_dev_device_release(rbd_dev);
	rbd_dev_image_release(rbd_dev);
	rbd_dev_destroy(rbd_dev);
	return count;
}
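
/*
 * Usage sketch (hypothetical device id), mirroring the add path:
 *
 *	$ echo "0" > /sys/bus/rbd/remove	# -EBUSY while still open
 *	$ echo "0 force" > /sys/bus/rbd/remove	# fail outstanding I/O,
 *						# then remove the device
 */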

static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	if (single_major)
		return -EINVAL;

	return do_rbd_remove(bus, buf, count);
}

static ssize_t rbd_remove_single_major(struct bus_type *bus,
				       const char *buf,
				       size_t count)
{
	return do_rbd_remove(bus, buf, count);
}

/*
 * create control files in sysfs
 * /sys/bus/rbd/...
 */
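/*
 * Layout sketch (inferred from the handlers above): the bus attributes
 * registered here back /sys/bus/rbd/add and /sys/bus/rbd/remove; when
 * the single_major module parameter is set, those two return -EINVAL
 * and the add_single_major/remove_single_major files are used instead.
 */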
static int rbd_sysfs_init(void)
{
	int ret;

	ret = device_register(&rbd_root_dev);
	if (ret < 0)
		return ret;

	ret = bus_register(&rbd_bus_type);
	if (ret < 0)
		device_unregister(&rbd_root_dev);

	return ret;
}

static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}

static int rbd_slab_init(void)
{
	rbd_assert(!rbd_img_request_cache);
	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
	if (!rbd_img_request_cache)
		return -ENOMEM;

	rbd_assert(!rbd_obj_request_cache);
	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
	if (!rbd_obj_request_cache)
		goto out_err;

	return 0;

out_err:
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
	return -ENOMEM;
}

static void rbd_slab_exit(void)
{
	rbd_assert(rbd_obj_request_cache);
	kmem_cache_destroy(rbd_obj_request_cache);
	rbd_obj_request_cache = NULL;

	rbd_assert(rbd_img_request_cache);
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
}

static int __init rbd_init(void)
{
	int rc;

	if (!libceph_compatible(NULL)) {
		rbd_warn(NULL, "libceph incompatibility (quitting)");
		return -EINVAL;
	}

	rc = rbd_slab_init();
	if (rc)
		return rc;

	/*
	 * The number of active work items is limited by the number of
	 * rbd devices * queue depth, so leave @max_active at default.
	 */
	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
	if (!rbd_wq) {
		rc = -ENOMEM;
		goto err_out_slab;
	}

	if (single_major) {
		rbd_major = register_blkdev(0, RBD_DRV_NAME);
		if (rbd_major < 0) {
			rc = rbd_major;
			goto err_out_wq;
		}
	}

	rc = rbd_sysfs_init();
	if (rc)
		goto err_out_blkdev;

	if (single_major)
		pr_info("loaded (major %d)\n", rbd_major);
	else
		pr_info("loaded\n");

	return 0;

err_out_blkdev:
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
err_out_wq:
	destroy_workqueue(rbd_wq);
err_out_slab:
	rbd_slab_exit();
	return rc;
}

static void __exit rbd_exit(void)
{
	ida_destroy(&rbd_dev_id_ida);
	rbd_sysfs_cleanup();
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
	destroy_workqueue(rbd_wq);
	rbd_slab_exit();
}

module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
MODULE_LICENSE("GPL");